[llvm-branch-commits] [llvm] [SelectionDAG] Fold extracts of subvector inserts (PR #201271)
Krzysztof Drewniak via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 3 10:58:18 PDT 2026
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/201271
>From 6139ab9b9f68a0a108e88c82fa10bd493995df3a Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Sat, 30 May 2026 02:19:01 +0000
Subject: [PATCH] [SelectionDAG] Fold extracts of subvector inserts
Fold extract_subvector(insert_subvector(...)) when the extraction is
outside the inserted subvector or the inserted subvector only amends
the extracted region.
In particular,
1. vA extract_subvector (vB insert_subvector(vB X, vC Y, C1), C2) =>
vA extract_subvector(X, C2) when [C2, C2 + A) intersect [C1, C1 + C)
is the empty set
2. ... => extract_subvector(Y, C2 - C1) if [C2, C2 + A) is a subset of
[C1, C1 + C) - an existing simplification
3. ... => vA insert_subvector(vA extract_subvector(vB X, C2), vC Y, C1 - C2)
if [C1, C1 + C) is a subset of [C2, C2 + A) - that is, if you're only
updating the extracted sub-part.
Adds a regression test for an infinite SelectionDAG cycle that is
fixed by a stack of commits that ends with this one.
AI note: an LLM generated the code and the test, I've read them
Co-Authored-By: OpenAI Codex <codex at openai.com>
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 35 ++++-
...agcombine-freeze-extract-subvector-loop.ll | 45 ++++++
.../vector-interleaved-store-i16-stride-3.ll | 12 +-
.../vector-interleaved-store-i16-stride-6.ll | 92 ++++++-------
.../vector-interleaved-store-i64-stride-6.ll | 128 ++++++++++--------
.../vector-interleaved-store-i8-stride-6.ll | 12 +-
.../CodeGen/X86/vector-replicaton-i1-mask.ll | 32 ++---
7 files changed, 212 insertions(+), 144 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0dcaeb5b22c9a..2cf455c89a4f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27590,20 +27590,41 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getSplatVector(NVT, DL, V.getOperand(0));
// extract_subvector(insert_subvector(x,y,c1),c2)
+ // --> extract_subvector(x,c2)
+ // iff we're extracting wholly outside the inserted subvector.
+ //
// --> extract_subvector(y,c2-c1)
- // iff we're just extracting from the inserted subvector.
+ // iff we're extracting wholly from the inserted subvector.
+ //
+ // --> insert_subvector(extract_subvector(x,c2), y, c1-c2)
+ // iff the inserted subvector is wholly contained by the extraction.
if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Src = V.getOperand(0);
SDValue InsSub = V.getOperand(1);
EVT InsSubVT = InsSub.getValueType();
unsigned NumInsElts = InsSubVT.getVectorMinNumElements();
unsigned InsIdx = V.getConstantOperandVal(2);
unsigned NumSubElts = NVT.getVectorMinNumElements();
- if (InsIdx <= ExtIdx && (ExtIdx + NumSubElts) <= (InsIdx + NumInsElts) &&
- TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx) &&
- InsSubVT.isFixedLengthVector() && NVT.isFixedLengthVector() &&
- V.getValueType().isFixedLengthVector())
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, InsSub,
- DAG.getVectorIdxConstant(ExtIdx - InsIdx, DL));
+ if (InsSubVT.isFixedLengthVector() && NVT.isFixedLengthVector() &&
+ V.getValueType().isFixedLengthVector()) {
+ uint64_t ExtEnd = ExtIdx + NumSubElts;
+ uint64_t InsEnd = InsIdx + NumInsElts;
+ if (ExtEnd <= InsIdx || InsEnd <= ExtIdx)
+ return DAG.getExtractSubvector(DL, NVT, Src, ExtIdx);
+
+ if (InsIdx <= ExtIdx && ExtEnd <= InsEnd &&
+ TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx))
+ return DAG.getExtractSubvector(DL, NVT, InsSub, ExtIdx - InsIdx);
+
+ if (ExtIdx <= InsIdx && InsEnd <= ExtEnd &&
+ InsSubVT.getVectorElementType() == NVT.getVectorElementType() &&
+ (InsIdx - ExtIdx) % NumInsElts == 0 &&
+ hasOperation(ISD::INSERT_SUBVECTOR, NVT)) {
+ SDValue NewExtract = DAG.getExtractSubvector(DL, NVT, Src, ExtIdx);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NVT, NewExtract, InsSub,
+ DAG.getVectorIdxConstant(InsIdx - ExtIdx, DL));
+ }
+ }
}
// Try to move vector bitcast after extract_subv by scaling extraction index:
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll
new file mode 100644
index 0000000000000..8e929b55bc1f1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; Regression test for an infinite DAGCombine loop involving freeze sinking
+; through extract_subvector users of this shuffle/select chain.
+; See https://github.com/ROCm/llvm-project/issues/2616 for the original report.
+define amdgpu_kernel void @freeze_loop(<2 x i16> %0, i1 %1) {
+; CHECK-LABEL: freeze_loop:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_bitcmp1_b32 s1, 0
+; CHECK-NEXT: s_cselect_b32 s0, s0, 0x10001
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
+ %3 = shufflevector <2 x i16> %0, <2 x i16> zeroinitializer, <23 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = select i1 %1, <23 x i16> %3, <23 x i16> zeroinitializer
+ %5 = shufflevector <23 x i16> %4, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 23, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %6 = select i1 %1, <23 x i16> zeroinitializer, <23 x i16> %5
+ %7 = shufflevector <23 x i16> %6, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %8 = select i1 %1, <23 x i16> zeroinitializer, <23 x i16> %7
+ %9 = shufflevector <23 x i16> %8, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 23, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %10 = shufflevector <23 x i16> %4, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 23, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %11 = select i1 %1, <23 x i16> zeroinitializer, <23 x i16> %9
+ %12 = shufflevector <23 x i16> %11, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %13 = select i1 %1, <23 x i16> %10, <23 x i16> %12
+ %14 = shufflevector <23 x i16> %13, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23, i32 poison, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %15 = select i1 %1, <23 x i16> %14, <23 x i16> %10
+ %16 = shufflevector <23 x i16> %15, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 23, i32 poison, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %17 = shufflevector <23 x i16> %15, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %18 = select i1 %1, <23 x i16> %16, <23 x i16> %17
+ %19 = shufflevector <23 x i16> %18, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 23, i32 poison, i32 20, i32 21, i32 22>
+ %20 = shufflevector <23 x i16> %13, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+ %21 = select i1 %1, <23 x i16> %19, <23 x i16> %20
+ %22 = shufflevector <23 x i16> %21, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 23, i32 poison, i32 22>
+ %23 = select i1 %1, <23 x i16> %22, <23 x i16> splat (i16 1)
+ %24 = shufflevector <23 x i16> %23, <23 x i16> zeroinitializer, <2 x i32> <i32 20, i32 21>
+ store <2 x i16> %24, ptr addrspace(3) null, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 7dbff047e4f87..6967b87a47b81 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -543,9 +543,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
; AVX512-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -564,9 +563,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -588,9 +586,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -609,9 +606,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index bc7ed7552e77c..80b11e572050f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -489,28 +489,27 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,3,7,0]
-; AVX512-NEXT: vpermi2d %xmm6, %xmm7, %xmm8
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
-; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa %xmm6, 32(%rax)
-; AVX512-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
+; AVX512-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
+; AVX512-NEXT: vpshufb %ymm7, %ymm8, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,7,0]
+; AVX512-NEXT: vpermi2d %xmm2, %xmm0, %xmm1
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rax)
+; AVX512-NEXT: vmovdqa %ymm6, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -543,9 +542,8 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,5,6,3]
; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm0
; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -558,28 +556,27 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,3,7,0]
-; AVX512DQ-NEXT: vpermi2d %xmm6, %xmm7, %xmm8
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa %xmm6, 32(%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm8, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,7,0]
+; AVX512DQ-NEXT: vpermi2d %xmm2, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm6, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -612,9 +609,8 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,5,6,3]
; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index cba11be4d8456..3d5a6a36377f7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -135,16 +135,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-NEXT: vmovdqa (%r9), %xmm3
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -154,16 +156,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -173,16 +177,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%r9), %xmm3
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -192,16 +198,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -211,16 +219,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-NEXT: vmovdqa (%r9), %xmm3
; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512BW-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -230,16 +240,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm3
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512BW-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -249,16 +261,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-BW-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -268,16 +282,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm3
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index d6c65fa82fa6c..f78700d8cd9a1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -945,9 +945,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512BW-NEXT: kmovd %ecx, %k1
; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -982,9 +981,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -1019,9 +1017,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -1056,9 +1053,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 76dfd019c0883..aaeab617d56c8 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -3207,47 +3207,45 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vpmovm2w %k1, %zmm0
; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm1
; AVX512BW-ONLY-NEXT: movl $1, %eax
-; AVX512BW-ONLY-NEXT: kmovd %eax, %k2
-; AVX512BW-ONLY-NEXT: vmovdqu16 %zmm0, %zmm1 {%k2}
-; AVX512BW-ONLY-NEXT: vpmovw2m %zmm1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: vpmovm2w %k1, %zmm2
+; AVX512BW-ONLY-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-NEXT: vpmovw2m %zmm1, %k1
+; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
; AVX512BW-ONLY-NEXT: vpmovsxbw {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
-; AVX512BW-ONLY-NEXT: vpermw %zmm2, %zmm3, %zmm2
-; AVX512BW-ONLY-NEXT: vpmovw2m %zmm2, %k1
+; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm3, %zmm0
+; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $48, %k0, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k0, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf16:
; AVX512VBMI-ONLY: # %bb.0:
-; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k2
-; AVX512VBMI-ONLY-NEXT: vpmovm2b %k2, %zmm0
+; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k1
+; AVX512VBMI-ONLY-NEXT: vpmovm2b %k1, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT: vpmovm2w %k2, %zmm1
+; AVX512VBMI-ONLY-NEXT: vpmovm2w %k1, %zmm1
; AVX512VBMI-ONLY-NEXT: movl $1, %eax
; AVX512VBMI-ONLY-NEXT: kmovd %eax, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: vpmovm2w %k2, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovsxbw {{.*#+}} zmm2 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermw %zmm1, %zmm2, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm1, %k2
More information about the llvm-branch-commits
mailing list