[llvm] [X86] combineINSERT_SUBVECTOR - fold insert_subvector(base,extract_subvector(broadcast)) -> blend shuffle(base,broadcast) (REAPPLIED) (PR #133724)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 07:09:13 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/133724
If the broadcast is already the full vector width, try to prefer a blend/vshuff64x2 over a vector insertion which is usually lower latency (and sometimes a lower uop count), and reduces changes in vector sizes that can interfere with further combines.
Updated version of #133083 - which lead to infinite loops due to shuffle lowering recreating the INSERT_SUBVECTOR pattern, this variant creates the BLENDI/SHUF128 nodes directly.
>From 67f08dcd383c3b5ea687fa73093e3cbaccbb88d1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 31 Mar 2025 15:08:10 +0100
Subject: [PATCH] [X86] combineINSERT_SUBVECTOR - fold
insert_subvector(base,extract_subvector(broadcast)) -> blend
shuffle(base,broadcast) (REAPPLIED)
If the broadcast is already the full vector width, try to prefer a blend/vshuff64x2 over a vector insertion which is usually lower latency (and sometimes a lower uop count), and reduces changes in vector sizes that can interfere with further combines.
Updated version of #133083 - which lead to infinite loops due to shuffle lowering recreating the INSERT_SUBVECTOR pattern, this variant creates the BLENDI/SHUF128 nodes directly.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++++++++++++++++++
.../any_extend_vector_inreg_of_broadcast.ll | 24 +++++++++----------
...d_vector_inreg_of_broadcast_from_memory.ll | 16 ++++++-------
.../CodeGen/X86/insert-subvector-broadcast.ll | 2 +-
.../vector-interleaved-store-i32-stride-5.ll | 2 +-
5 files changed, 46 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 76de7e888d985..5fff78f7a173a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58896,6 +58896,30 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
}
+ // If we're broadcasting, see if we can use a blend instead of
+ // extract/insert pair. Ensure that the subvector is aligned with the
+ // insertion/extractions.
+ if ((ExtIdxVal % SubVecNumElts) == 0 && (IdxVal % SubVecNumElts) == 0 &&
+ (ExtSrc.getOpcode() == X86ISD::VBROADCAST ||
+ ExtSrc.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ (ExtSrc.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(ExtSrc)->getMemoryVT() == SubVecVT))) {
+ if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+ uint64_t BlendMask = IdxVal == 0 ? 0x0F : 0xF0;
+ SDValue Blend = DAG.getNode(
+ X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
+ DAG.getBitcast(MVT::v8f32, ExtSrc),
+ DAG.getTargetConstant(BlendMask, dl, MVT::i8));
+ return DAG.getBitcast(OpVT, Blend);
+ } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
+ SDValue Lo = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? ExtSrc : Vec);
+ SDValue Hi = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? Vec : ExtSrc);
+ SDValue Shuffle =
+ DAG.getNode(X86ISD::SHUF128, dl, MVT::v8f64, Lo, Hi,
+ getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
+ return DAG.getBitcast(OpVT, Shuffle);
+ }
+ }
}
// Match concat_vector style patterns.
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 8125e062e7ffd..6f4e7abda8b00 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -2573,7 +2573,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -2590,7 +2590,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -2835,7 +2835,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -2852,7 +2852,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -2868,7 +2868,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3096,7 +3096,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3113,7 +3113,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3129,7 +3129,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3612,7 +3612,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3628,7 +3628,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3864,7 +3864,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3880,7 +3880,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 305509ca7fc3f..52f856befa130 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -2239,7 +2239,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2253,7 +2253,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2267,7 +2267,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2458,7 +2458,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2472,7 +2472,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2486,7 +2486,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -3095,7 +3095,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
@@ -3107,7 +3107,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
diff --git a/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll b/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll
index 47cd752ef80a4..9b35857804022 100644
--- a/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll
@@ -9,7 +9,7 @@ define void @insert_subvector_broadcast_as_blend() {
; CHECK-NEXT: vpbroadcastq %rax, %zmm0
; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm0[4,5,6,7]
; CHECK-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0
; CHECK-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k1
; CHECK-NEXT: kunpckbw %k0, %k1, %k1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index e88a651d29cef..89330122fa239 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -350,7 +350,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6],ymm5[7]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3],ymm7[4],ymm4[5,6,7]
; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,u,2,u,u,u,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7]
More information about the llvm-commits
mailing list