[llvm] [X86] lowerShuffleAsBroadcast - use isShuffleEquivalent to search for a hidden broadcast pattern (PR #126517)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 10 06:09:39 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
lowerShuffleAsBroadcast currently only matches a known-splat shuffle mask, but we can use the isShuffleEquivalent/IsElementEquivalent helpers to attempt to find a hidden broadcastable shuffle pattern.
This requires an extension to IsElementEquivalent to peek through bitcasts and match against wider shuffles; these typically appear during shuffle lowering where we've widened a preceding shuffle, often as part of a vector concatenation etc.
Amazingly, I hit this while yak shaving #126033...
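For illustration, here is a minimal standalone sketch of what a "hidden" broadcast looks like (this models the idea only; the helper name and types below are hypothetical, not the LLVM API): a v8i16 mask such as <0,4,0,4,...> is not a literal splat, but if the operand is a bitcast of a v4i32 value that is itself a splat, every referenced i16 lane reads the same value, so the shuffle can still be lowered as a broadcast.

```cpp
#include <cassert>
#include <vector>

// Hypothetical helper (not the LLVM API) mirroring the idea: narrow lane Idx
// of a bitcast maps to wide source element Idx / Scale at sub-position
// Idx % Scale. Two narrow lanes are equivalent if they read the same
// sub-position of wide elements that are themselves known to be equal.
static bool narrowLanesEqual(int Idx, int ExpectedIdx, int Scale,
                             const std::vector<int> &WideSplatMask) {
  return (Idx % Scale) == (ExpectedIdx % Scale) &&
         WideSplatMask[Idx / Scale] == WideSplatMask[ExpectedIdx / Scale];
}

int main() {
  // v8i16 view of a v4i32 operand that is itself a splat of element 0.
  const std::vector<int> WideSplatMask = {0, 0, 0, 0}; // v4i32 shuffle mask
  const int Scale = 32 / 16;                           // 2 i16 lanes per i32
  // Not a literal splat mask, so a plain getSplatIndex-style check gives up,
  // yet every lane is equivalent to lane 0 of an all-zeros broadcast mask:
  // the shuffle is a hidden broadcast of i16 element 0.
  const std::vector<int> Mask = {0, 4, 0, 4, 0, 4, 0, 4};
  for (int M : Mask)
    assert(narrowLanesEqual(M, /*ExpectedIdx=*/0, Scale, WideSplatMask));
  return 0;
}
```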
---
Patch is 105.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126517.diff
13 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+25-2)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll (+39-52)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll (+49-58)
- (modified) llvm/test/CodeGen/X86/avx512fp16-mov.ll (+2-8)
- (modified) llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll (+5-6)
- (modified) llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll (+5-3)
- (modified) llvm/test/CodeGen/X86/matrix-multiply.ll (+24-24)
- (modified) llvm/test/CodeGen/X86/pr51615.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-half-conversions.ll (+27-49)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll (+87-88)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll (+60-72)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 995b4de12ce12c2..4ad400b43434dc8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9799,6 +9799,24 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
MaskSize == (int)ExpectedOp.getNumOperands())
return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
break;
+ case ISD::BITCAST:
+ if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
+ SDValue Src = peekThroughBitcasts(Op);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() &&
+ (SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
+ unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
+ return (Idx % Scale) == (ExpectedIdx % Scale) &&
+ IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
+ Idx / Scale, ExpectedIdx / Scale);
+ }
+ }
+ break;
+ case ISD::VECTOR_SHUFFLE: {
+ auto *SVN = cast<ShuffleVectorSDNode>(Op);
+ return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&
+ SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
+ }
case X86ISD::VBROADCAST:
case X86ISD::VBROADCAST_LOAD:
// TODO: Handle MaskSize != VT.getVectorNumElements()?
@@ -12779,8 +12797,13 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// Check that the mask is a broadcast.
int BroadcastIdx = getSplatIndex(Mask);
- if (BroadcastIdx < 0)
- return SDValue();
+ if (BroadcastIdx < 0) {
+ // Check for hidden broadcast.
+ SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
+ if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
+ return SDValue();
+ BroadcastIdx = 0;
+ }
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index cad1d09f11d9c3a..4c4d5cb3166a812 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1220,7 +1220,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1234,7 +1234,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1247,7 +1247,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1259,10 +1259,9 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -1345,7 +1344,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1359,7 +1358,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1372,7 +1371,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1384,10 +1383,9 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -1719,7 +1717,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1732,7 +1730,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1745,7 +1743,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2691,14 +2689,13 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2708,10 +2705,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2
-; AVX512F-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2724,10 +2720,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2
-; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2739,11 +2734,10 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
-; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -2959,14 +2953,13 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2976,10 +2969,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
-; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2992,10 +2984,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3007,11 +2998,10 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
-; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3742,14 +3732,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -3759,9 +3748,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3775,9 +3763,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 3d72319f59ca9ec..4d3906c2297639f 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -910,10 +910,9 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1011,7 +1010,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -1022,8 +1021,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
@@ -1032,8 +1031,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
; AVX512...
[truncated]
``````````
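To summarize the shape of the core change above in isolation (a simplified model only; the names below are hypothetical stand-ins for getSplatIndex/isShuffleEquivalent rather than the LLVM implementation): when the mask is not a literal splat, it is compared lane-by-lane against an all-zeros broadcast mask using an element-equivalence predicate instead of exact equality, so masks whose lanes merely read equal values are still accepted as broadcasts.

```cpp
#include <cassert>
#include <functional>
#include <vector>

// Simplified model of the fallback: ElementsEquivalent stands in for LLVM's
// IsElementEquivalent and answers "do lanes A and B of the operand hold the
// same value?". Undef lanes (negative mask entries) match anything.
static int findBroadcastIndex(
    const std::vector<int> &Mask,
    const std::function<bool(int, int)> &ElementsEquivalent) {
  // First try a literal splat, as before.
  int Splat = -1;
  bool IsSplat = true;
  for (int M : Mask) {
    if (M < 0)
      continue;
    if (Splat < 0)
      Splat = M;
    else if (M != Splat)
      IsSplat = false;
  }
  if (IsSplat && Splat >= 0)
    return Splat;

  // Otherwise look for a hidden broadcast: every defined lane must read a
  // value equivalent to lane 0 (i.e. match an all-zeros broadcast mask).
  for (int M : Mask)
    if (M >= 0 && !ElementsEquivalent(M, 0))
      return -1;
  return 0;
}

int main() {
  // Hypothetical oracle for an operand whose value repeats every 4 lanes
  // (e.g. a 128-bit subvector broadcast viewed through a bitcast).
  auto Equiv = [](int A, int B) { return (A % 4) == (B % 4); };
  // The non-splat mask <0,4,0,4,0,4,0,4> is accepted as a broadcast of lane 0.
  assert(findBroadcastIndex({0, 4, 0, 4, 0, 4, 0, 4}, Equiv) == 0);
  // A genuinely non-broadcast mask is still rejected.
  assert(findBroadcastIndex({0, 1, 2, 3, 4, 5, 6, 7}, Equiv) == -1);
  return 0;
}
```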
https://github.com/llvm/llvm-project/pull/126517