[llvm-branch-commits] [llvm] [SelectionDAG] Fold extracts of subvector inserts (PR #201271)
Krzysztof Drewniak via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jun 2 23:49:18 PDT 2026
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/201271
>From 7bebe28727d9c6722239e5da3da4080837bc6bd1 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Sat, 30 May 2026 02:19:01 +0000
Subject: [PATCH] [SelectionDAG] Fold extracts of subvector inserts
Fold extract_subvector(insert_subvector(...)) when the extraction is
wholly outside the inserted subvector, or when the inserted subvector
only amends part of the extracted region.
In particular,
1. vA extract_subvector (vB insert_subvector(vB X, vC Y, C1), C2) =>
vA extract_subvector(X, C2) when [C2, C2 + A) intersect [C1, C1 + C)
is the empty set
2. ... => extract_subvector(Y, C2 - C1) if [C2, C2 + A) is a subset of
[C1, C1 + C) - an existing simplification
3. ... => vA insert_subvector(vA extract_subvector(vB X, C2), vC Y, C1 - C2)
if [C1, C1 + C) is a subset of [C2, C2 + A) - that is, if you're only
updating the extracted sub-part.
Adds a regression test for an infinite SelectionDAG cycle that is
fixed by a stack of commits that ends with this one.
AI note: an LLM generated the code and the test; I have read them.
Co-Authored-By: OpenAI Codex <codex at openai.com>
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 35 ++++-
...agcombine-freeze-extract-subvector-loop.ll | 45 ++++++
.../CodeGen/X86/dagcombine-extract-insert.ll | 47 ++++---
.../vector-interleaved-store-i16-stride-3.ll | 12 +-
.../vector-interleaved-store-i16-stride-6.ll | 92 ++++++-------
.../vector-interleaved-store-i64-stride-6.ll | 128 ++++++++++--------
.../vector-interleaved-store-i8-stride-6.ll | 12 +-
.../CodeGen/X86/vector-replicaton-i1-mask.ll | 32 ++---
8 files changed, 237 insertions(+), 166 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0dcaeb5b22c9a..2cf455c89a4f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27590,20 +27590,41 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getSplatVector(NVT, DL, V.getOperand(0));
// extract_subvector(insert_subvector(x,y,c1),c2)
+ // --> extract_subvector(x,c2)
+ // iff we're extracting wholly outside the inserted subvector.
+ //
// --> extract_subvector(y,c2-c1)
- // iff we're just extracting from the inserted subvector.
+ // iff we're extracting wholly from the inserted subvector.
+ //
+ // --> insert_subvector(extract_subvector(x,c2), y, c1-c2)
+ // iff the inserted subvector is wholly contained by the extraction.
if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Src = V.getOperand(0);
SDValue InsSub = V.getOperand(1);
EVT InsSubVT = InsSub.getValueType();
unsigned NumInsElts = InsSubVT.getVectorMinNumElements();
unsigned InsIdx = V.getConstantOperandVal(2);
unsigned NumSubElts = NVT.getVectorMinNumElements();
- if (InsIdx <= ExtIdx && (ExtIdx + NumSubElts) <= (InsIdx + NumInsElts) &&
- TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx) &&
- InsSubVT.isFixedLengthVector() && NVT.isFixedLengthVector() &&
- V.getValueType().isFixedLengthVector())
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, InsSub,
- DAG.getVectorIdxConstant(ExtIdx - InsIdx, DL));
+ if (InsSubVT.isFixedLengthVector() && NVT.isFixedLengthVector() &&
+ V.getValueType().isFixedLengthVector()) {
+ uint64_t ExtEnd = ExtIdx + NumSubElts;
+ uint64_t InsEnd = InsIdx + NumInsElts;
+ if (ExtEnd <= InsIdx || InsEnd <= ExtIdx)
+ return DAG.getExtractSubvector(DL, NVT, Src, ExtIdx);
+
+ if (InsIdx <= ExtIdx && ExtEnd <= InsEnd &&
+ TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx))
+ return DAG.getExtractSubvector(DL, NVT, InsSub, ExtIdx - InsIdx);
+
+ if (ExtIdx <= InsIdx && InsEnd <= ExtEnd &&
+ InsSubVT.getVectorElementType() == NVT.getVectorElementType() &&
+ (InsIdx - ExtIdx) % NumInsElts == 0 &&
+ hasOperation(ISD::INSERT_SUBVECTOR, NVT)) {
+ SDValue NewExtract = DAG.getExtractSubvector(DL, NVT, Src, ExtIdx);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NVT, NewExtract, InsSub,
+ DAG.getVectorIdxConstant(InsIdx - ExtIdx, DL));
+ }
+ }
}
// Try to move vector bitcast after extract_subv by scaling extraction index:
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll
new file mode 100644
index 0000000000000..8e929b55bc1f1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; Regression test for an infinite DAGCombine loop involving freeze sinking
+; through extract_subvector users of this shuffle/select chain.
+; See https://github.com/ROCm/llvm-project/issues/2616 for the original report.
+define amdgpu_kernel void @freeze_loop(<2 x i16> %0, i1 %1) {
+; CHECK-LABEL: freeze_loop:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_bitcmp1_b32 s1, 0
+; CHECK-NEXT: s_cselect_b32 s0, s0, 0x10001
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
+ %3 = shufflevector <2 x i16> %0, <2 x i16> zeroinitializer, <23 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = select i1 %1, <23 x i16> %3, <23 x i16> zeroinitializer
+ %5 = shufflevector <23 x i16> %4, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 23, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %6 = select i1 %1, <23 x i16> zeroinitializer, <23 x i16> %5
+ %7 = shufflevector <23 x i16> %6, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %8 = select i1 %1, <23 x i16> zeroinitializer, <23 x i16> %7
+ %9 = shufflevector <23 x i16> %8, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 23, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %10 = shufflevector <23 x i16> %4, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 23, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %11 = select i1 %1, <23 x i16> zeroinitializer, <23 x i16> %9
+ %12 = shufflevector <23 x i16> %11, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %13 = select i1 %1, <23 x i16> %10, <23 x i16> %12
+ %14 = shufflevector <23 x i16> %13, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23, i32 poison, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %15 = select i1 %1, <23 x i16> %14, <23 x i16> %10
+ %16 = shufflevector <23 x i16> %15, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 23, i32 poison, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %17 = shufflevector <23 x i16> %15, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %18 = select i1 %1, <23 x i16> %16, <23 x i16> %17
+ %19 = shufflevector <23 x i16> %18, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 23, i32 poison, i32 20, i32 21, i32 22>
+ %20 = shufflevector <23 x i16> %13, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %21 = select i1 %1, <23 x i16> %19, <23 x i16> %20
+ %22 = shufflevector <23 x i16> %21, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 23, i32 poison, i32 22>
+ %23 = select i1 %1, <23 x i16> %22, <23 x i16> splat (i16 1)
+ %24 = shufflevector <23 x i16> %23, <23 x i16> zeroinitializer, <2 x i32> <i32 20, i32 21>
+ store <2 x i16> %24, ptr addrspace(3) null, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/dagcombine-extract-insert.ll b/llvm/test/CodeGen/X86/dagcombine-extract-insert.ll
index c0595fce4117d..11c33a0490a67 100644
--- a/llvm/test/CodeGen/X86/dagcombine-extract-insert.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-extract-insert.ll
@@ -7,30 +7,33 @@ define void @extract_insert_interleaved_store(ptr %in.vecptr0, ptr %in.vecptr1,
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; CHECK-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; CHECK-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; CHECK-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; CHECK-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; CHECK-NEXT: vmovdqa %xmm0, 32(%rax)
-; CHECK-NEXT: vmovdqa %ymm3, (%rax)
+; CHECK-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; CHECK-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; CHECK-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; CHECK-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; CHECK-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,ymm0[21,29]
+; CHECK-NEXT: vpor %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; CHECK-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u]
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; CHECK-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa %ymm0, (%rax)
+; CHECK-NEXT: vmovdqa %xmm1, 32(%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 7dbff047e4f87..6967b87a47b81 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -543,9 +543,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
; AVX512-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -564,9 +563,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -588,9 +586,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -609,9 +606,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index bc7ed7552e77c..80b11e572050f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -489,28 +489,27 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,3,7,0]
-; AVX512-NEXT: vpermi2d %xmm6, %xmm7, %xmm8
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
-; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa %xmm6, 32(%rax)
-; AVX512-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
+; AVX512-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
+; AVX512-NEXT: vpshufb %ymm7, %ymm8, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,7,0]
+; AVX512-NEXT: vpermi2d %xmm2, %xmm0, %xmm1
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rax)
+; AVX512-NEXT: vmovdqa %ymm6, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -543,9 +542,8 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,5,6,3]
; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm0
; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -558,28 +556,27 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,3,7,0]
-; AVX512DQ-NEXT: vpermi2d %xmm6, %xmm7, %xmm8
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa %xmm6, 32(%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm8, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,7,0]
+; AVX512DQ-NEXT: vpermi2d %xmm2, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm6, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -612,9 +609,8 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,5,6,3]
; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index cba11be4d8456..3d5a6a36377f7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -135,16 +135,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-NEXT: vmovdqa (%r9), %xmm3
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -154,16 +156,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -173,16 +177,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%r9), %xmm3
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -192,16 +198,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -211,16 +219,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-NEXT: vmovdqa (%r9), %xmm3
; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512BW-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -230,16 +240,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm3
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512BW-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -249,16 +261,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-BW-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -268,16 +282,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm3
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index d6c65fa82fa6c..f78700d8cd9a1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -945,9 +945,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512BW-NEXT: kmovd %ecx, %k1
; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -982,9 +981,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -1019,9 +1017,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -1056,9 +1053,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 76dfd019c0883..aaeab617d56c8 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -3207,47 +3207,45 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vpmovm2w %k1, %zmm0
; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm1
; AVX512BW-ONLY-NEXT: movl $1, %eax
-; AVX512BW-ONLY-NEXT: kmovd %eax, %k2
-; AVX512BW-ONLY-NEXT: vmovdqu16 %zmm0, %zmm1 {%k2}
-; AVX512BW-ONLY-NEXT: vpmovw2m %zmm1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: vpmovm2w %k1, %zmm2
+; AVX512BW-ONLY-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-NEXT: vpmovw2m %zmm1, %k1
+; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
; AVX512BW-ONLY-NEXT: vpmovsxbw {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
-; AVX512BW-ONLY-NEXT: vpermw %zmm2, %zmm3, %zmm2
-; AVX512BW-ONLY-NEXT: vpmovw2m %zmm2, %k1
+; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm3, %zmm0
+; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $48, %k0, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k0, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf16:
; AVX512VBMI-ONLY: # %bb.0:
-; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k2
-; AVX512VBMI-ONLY-NEXT: vpmovm2b %k2, %zmm0
+; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k1
+; AVX512VBMI-ONLY-NEXT: vpmovm2b %k1, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT: vpmovm2w %k2, %zmm1
+; AVX512VBMI-ONLY-NEXT: vpmovm2w %k1, %zmm1
; AVX512VBMI-ONLY-NEXT: movl $1, %eax
; AVX512VBMI-ONLY-NEXT: kmovd %eax, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: vpmovm2w %k2, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovsxbw {{.*#+}} zmm2 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermw %zmm1, %zmm2, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm1, %k2
More information about the llvm-branch-commits
mailing list