[llvm] [X86] combineEXTRACT_SUBVECTOR - extract from a larger subvector insertion (PR #132950)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 09:03:39 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/132950
Fold EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2) -> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
This extends the existing fold, which required the extract/insert subvector indices to match: the fold now applies whenever the originally inserted subvector is entirely contained within the extracted range.
This helps avoid unnecessary use of 512-bit vectors and improves the chances of concatenation folds.
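For illustration, a minimal standalone sketch (plain C++ modelling vectors as arrays, not LLVM code) of why the fold is legal: when the inserted subvector lies entirely inside the extracted range, extracting first and re-inserting at the rebased index C1-C2 yields the same elements. The concrete widths and indices below are made-up examples, not taken from the patch.

```cpp
// Sketch of the fold's legality condition:
//   EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(Src,Sub,C1),C2)
//     --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(Src,C2),Sub,C1-C2)
// valid iff Sub is entirely contained in the extracted range, i.e.
//   C2 <= C1 && C1 + Sub.size() <= C2 + NumExtractElts.
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <vector>

using Vec = std::vector<int>;

// Overwrite Src[Idx..Idx+Sub.size()) with Sub (INSERT_SUBVECTOR).
static Vec insertSub(Vec Src, const Vec &Sub, size_t Idx) {
  for (size_t I = 0; I != Sub.size(); ++I)
    Src[Idx + I] = Sub[I];
  return Src;
}

// Copy out N elements starting at Idx (EXTRACT_SUBVECTOR).
static Vec extractSub(const Vec &Src, size_t Idx, size_t N) {
  return Vec(Src.begin() + Idx, Src.begin() + Idx + N);
}

int main() {
  Vec Src(16);                    // e.g. a v16i32 source
  for (int I = 0; I != 16; ++I)
    Src[I] = I;
  Vec Sub = {100, 101, 102, 103}; // v4i32 inserted at element C1
  size_t C1 = 12, C2 = 8, N = 8;  // extract a v8i32 at element C2

  // Original form: insert into the wide vector, then extract.
  Vec Before = extractSub(insertSub(Src, Sub, C1), C2, N);

  // Folded form: extract first, then insert at the rebased index C1-C2.
  assert(C2 <= C1 && C1 + Sub.size() <= C2 + N && "Sub must be contained");
  Vec After = insertSub(extractSub(Src, C2, N), Sub, C1 - C2);

  assert(Before == After);
  std::puts("fold holds on this example");
}
```

Here the narrow fold stays entirely within 256-bit types, which is the point of the patch: the 512-bit insert/extract pair is replaced by an extract of the original source plus a cheaper insert at the rebased index.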
From ba52b8351e85b6f63607e3a56e0c1ac2d6f8ea49 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 25 Mar 2025 16:02:00 +0000
Subject: [PATCH] [X86] combineEXTRACT_SUBVECTOR - extract from a larger
subvector insertion
Fold EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2) -> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
This extends the existing fold, which required the extract/insert subvector indices to match: the fold now applies whenever the originally inserted subvector is entirely contained within the extracted range.
This helps avoid unnecessary use of 512-bit vectors and improves the chances of concatenation folds.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 30 +++++----
.../vector-interleaved-store-i16-stride-6.ll | 8 +--
.../vector-interleaved-store-i8-stride-3.ll | 64 +++++++++----------
.../vector-interleaved-store-i8-stride-6.ll | 12 ++--
llvm/test/CodeGen/X86/widen_fadd.ll | 16 ++---
llvm/test/CodeGen/X86/widen_fdiv.ll | 46 ++++++-------
llvm/test/CodeGen/X86/widen_fmul.ll | 16 ++---
llvm/test/CodeGen/X86/widen_fsub.ll | 16 ++---
.../CodeGen/X86/x86-interleaved-access.ll | 8 +--
9 files changed, 109 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 540b176e0d77f..b128a6dadbbb6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59085,20 +59085,22 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
}
- // If we are extracting from an insert into a larger vector, replace with a
- // smaller insert if we don't access less than the original subvector. Don't
- // do this for i1 vectors.
- // TODO: Relax the matching indices requirement?
- if (VT.getVectorElementType() != MVT::i1 &&
- InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
- IdxVal == InVec.getConstantOperandVal(2) &&
- InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
- SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
- InVec.getOperand(0), N->getOperand(1));
- unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
- InVec.getOperand(1),
- DAG.getVectorIdxConstant(NewIdxVal, DL));
+ // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
+ // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
+ // iff SUB is entirely contained in the extraction.
+ if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
+ InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
+ SDValue Src = InVec.getOperand(0);
+ SDValue Sub = InVec.getOperand(1);
+ EVT SubVT = Sub.getValueType();
+ uint64_t InsIdx = InVec.getConstantOperandVal(2);
+ if (IdxVal <= InsIdx &&
+ (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
+ SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
+ DAG.getVectorIdxConstant(IdxVal, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
+ DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
+ }
}
// If we're extracting an upper subvector from a broadcast we should just
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index a5813a60df4f2..33c57f2edf06e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -993,13 +993,13 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%r8), %xmm2
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1035,7 +1035,6 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
@@ -1044,6 +1043,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1085,13 +1085,13 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1127,7 +1127,6 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
@@ -1136,6 +1135,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 7f2210742e7f1..8802e8a779332 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -740,14 +740,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -763,14 +763,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -786,14 +786,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -809,14 +809,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -832,14 +832,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -855,14 +855,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -878,14 +878,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
@@ -901,14 +901,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 483de15d52276..6f48e3223bd5a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -1389,7 +1389,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%r8), %xmm2
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3]
@@ -1401,6 +1400,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13,u,u],zero,zero,ymm5[6,14,u,u],zero,zero,ymm5[7,15,u,u],zero,zero,ymm5[16,24,u,u],zero,zero,ymm5[17,25,u,u],zero,zero,ymm5[18,26]
; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
@@ -1428,7 +1428,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3]
@@ -1440,6 +1439,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13,u,u],zero,zero,ymm5[6,14,u,u],zero,zero,ymm5[7,15,u,u],zero,zero,ymm5[16,24,u,u],zero,zero,ymm5[17,25,u,u],zero,zero,ymm5[18,26]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
; AVX512-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
@@ -1467,7 +1467,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3]
@@ -1479,6 +1478,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13,u,u],zero,zero,ymm5[6,14,u,u],zero,zero,ymm5[7,15,u,u],zero,zero,ymm5[16,24,u,u],zero,zero,ymm5[17,25,u,u],zero,zero,ymm5[18,26]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
; AVX512DQ-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
@@ -1506,7 +1506,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3]
@@ -1518,6 +1517,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13,u,u],zero,zero,ymm5[6,14,u,u],zero,zero,ymm5[7,15,u,u],zero,zero,ymm5[16,24,u,u],zero,zero,ymm5[17,25,u,u],zero,zero,ymm5[18,26]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
; AVX512DQ-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
@@ -1545,7 +1545,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
@@ -1555,6 +1554,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zmm4[2,10],zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[38,46],zero,zero,zero,zero,zmm4[39,47],zero,zero,zero,zero,zmm4[48,56],zero,zero,zero,zero,zmm4[49,57],zero,zero,zero,zero,zmm4[50,58],zero,zero
; AVX512BW-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
@@ -1623,7 +1623,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2]
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
@@ -1633,6 +1632,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zmm4[2,10],zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[38,46],zero,zero,zero,zero,zmm4[39,47],zero,zero,zero,zero,zmm4[48,56],zero,zero,zero,zero,zmm4[49,57],zero,zero,zero,zero,zmm4[50,58],zero,zero
; AVX512DQ-BW-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll
index 825ee34561c79..c3700189d3d0e 100644
--- a/llvm/test/CodeGen/X86/widen_fadd.ll
+++ b/llvm/test/CodeGen/X86/widen_fadd.ll
@@ -252,16 +252,16 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
+; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll
index c0ec41237d301..fd251a99ca500 100644
--- a/llvm/test/CodeGen/X86/widen_fdiv.ll
+++ b/llvm/test/CodeGen/X86/widen_fdiv.ll
@@ -194,35 +194,35 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm6, %xmm5, %xmm5
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm7, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm8, %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero
+; AVX512VL-NEXT: vdivps %xmm9, %xmm8, %xmm8
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero
+; AVX512VL-NEXT: vdivps %xmm10, %xmm9, %xmm9
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero
+; AVX512VL-NEXT: vdivps %xmm11, %xmm10, %xmm10
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero
+; AVX512VL-NEXT: vdivps %xmm12, %xmm11, %xmm11
+; AVX512VL-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
+; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vdivps %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vdivps %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm11, %zmm10, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm9, %zmm8, %zmm2
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
+; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll
index 7011419fbc6fc..6c3e0ff5a9bcd 100644
--- a/llvm/test/CodeGen/X86/widen_fmul.ll
+++ b/llvm/test/CodeGen/X86/widen_fmul.ll
@@ -252,16 +252,16 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT: vmulps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
+; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll
index 915f122b50386..7405d9b7b1c65 100644
--- a/llvm/test/CodeGen/X86/widen_fsub.ll
+++ b/llvm/test/CodeGen/X86/widen_fsub.ll
@@ -252,16 +252,16 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT: vsubps %xmm8, %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
+; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 0d9d1963861f8..a01e6ca4b175d 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -962,14 +962,14 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu %xmm2, 32(%rdi)
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq