[llvm] [X86] getFauxShuffleMask - match 256-bit CONCAT(SUB0, SUB1) 64-bit elt patterns as well as 512-bit (PR #127392)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 16 05:09:33 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
The 512-bit filter was there to prevent AVX1/2 regressions, but most of those cases are now handled by canonicalizeShuffleWithOp.
Ideally we should support smaller element widths as well.
Noticed while triaging issue #116931.
---
Patch is 150.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127392.diff
15 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+2-4)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll (+41-45)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll (+34-40)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+35-35)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll (+98-98)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll (+72-80)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll (+36-40)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll (+33-33)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+112-112)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll (+90-98)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll (+56-12)
- (modified) llvm/test/CodeGen/X86/widen_fadd.ll (+20-64)
- (modified) llvm/test/CodeGen/X86/widen_fdiv.ll (+7-38)
- (modified) llvm/test/CodeGen/X86/widen_fmul.ll (+20-64)
- (modified) llvm/test/CodeGen/X86/widen_fsub.ll (+20-64)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9592137b34842..21b08a4a93fc7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6130,11 +6130,9 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
return true;
}
// Handle CONCAT(SUB0, SUB1).
- // Limit this to vXi64 512-bit vector cases to make the most of AVX512
- // cross lane shuffles.
+ // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
- NumBitsPerElt == 64 && NumSizeInBits == 512 &&
- Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(0).isUndef() &&
Src.getOperand(1).getValueType() == SubVT &&
Src.getConstantOperandVal(2) == 0) {
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 0beb304a5673d..4a2e7d55d3e88 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -211,10 +211,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, 16(%rcx)
@@ -228,10 +228,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -245,10 +245,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -262,10 +262,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovq %xmm1, 16(%rcx)
@@ -279,10 +279,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -296,10 +296,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vmovq %xmm1, 16(%rcx)
@@ -313,10 +313,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -330,12 +330,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -345,12 +344,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -360,12 +358,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -375,12 +372,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index 704c92924abfb..71eb606a8665d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -217,12 +217,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -235,12 +235,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -269,12 +269,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX512-NEXT: vmovdqa %ymm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -287,10 +287,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -303,12 +302,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX512DQ-NEXT: vmovdqa %ymm0, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -321,10 +320,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -337,10 +335,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpun...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/127392
More information about the llvm-commits
mailing list