[llvm] [X86] collectConcatOps - handle extract_subvector(concat_subvectors(...)) patterns (PR #143406)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 9 09:10:35 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/143406
From 9fcb61c59ade17b958411fae90adafa795918a2b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 9 Jun 2025 16:57:19 +0100
Subject: [PATCH] [X86] collectConcatOps - handle
extract_subvector(concat_subvectors(...)) patterns
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 19 ++++
.../vector-interleaved-store-i8-stride-3.ll | 88 +++++++++----------
.../CodeGen/X86/x86-interleaved-access.ll | 11 ++-
3 files changed, 64 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 244b0f9410b85..9ea45513cc019 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4311,6 +4311,25 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
}
}
+ if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ uint64_t Idx = N->getConstantOperandVal(1);
+
+ // Collect all the subvectors from the source vector and slice off the
+ // extraction.
+ SmallVector<SDValue, 4> SrcOps;
+ if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
+ VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
+ (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
+ (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
+ unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
+ unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
+ Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
+ return true;
+ }
+ }
+
return false;
}
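The new EXTRACT_SUBVECTOR clause above reduces to whole-subvector index arithmetic: once the source has been decomposed into equally sized subvector operands, an extraction that is wider than one subvector and aligned to a subvector boundary is just a contiguous run of those operands. Below is a minimal standalone sketch of that arithmetic (plain C++, not part of the patch; the Sub struct and the sliceConcat helper are illustrative names only -- the real code works on SDValues and compares bit widths rather than element counts, and relies on DAG legality instead of the explicit bounds check added here):

  #include <cstdint>
  #include <optional>
  #include <vector>

  // Hypothetical stand-in for one concatenated subvector operand.
  struct Sub { unsigned NumElts; };

  // Return the run of whole subvectors covered by extracting ExtractNumElts
  // elements starting at element Idx, or nullopt if the extraction does not
  // line up on subvector boundaries (mirrors the bail-out conditions above).
  static std::optional<std::vector<Sub>>
  sliceConcat(const std::vector<Sub> &SrcOps, unsigned ExtractNumElts,
              uint64_t Idx) {
    if (SrcOps.empty())
      return std::nullopt;
    unsigned SubNumElts = SrcOps[0].NumElts;
    if (ExtractNumElts <= SubNumElts ||     // must be wider than one subvector
        ExtractNumElts % SubNumElts != 0 || // must cover whole subvectors
        Idx % SubNumElts != 0)              // must start on a boundary
      return std::nullopt;
    uint64_t SubIdx = Idx / SubNumElts;
    uint64_t NumSubs = ExtractNumElts / SubNumElts;
    if (SubIdx + NumSubs > SrcOps.size())   // extra check for the standalone sketch
      return std::nullopt;
    return std::vector<Sub>(SrcOps.begin() + SubIdx,
                            SrcOps.begin() + SubIdx + NumSubs);
  }

For example, extracting 4 elements starting at element 4 from an 8-element source built from four 2-element subvectors gives SubIdx = 2 and NumSubs = 2, i.e. subvector operands 2 and 3 are returned.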
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 782a81be47603..674bad2c7aa87 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -740,16 +740,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride3_vf16:
@@ -763,16 +762,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride3_vf16:
@@ -786,16 +784,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride3_vf16:
@@ -809,16 +806,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride3_vf16:
@@ -832,16 +828,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -855,16 +850,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride3_vf16:
@@ -878,16 +872,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -901,16 +894,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
%in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 47a6022e428c3..7cddebdca5cca 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -962,16 +962,15 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqu %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovdqu %xmm1, (%rdi)
; AVX512-NEXT: vmovdqu %xmm2, 32(%rdi)
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>