[llvm] [X86] combineX86ShufflesRecursively - iteratively peek through bitcasts to free subvector widening/narrowing sources. (PR #134701)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 7 11:22:32 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Generalizes the existing code to repeatedly peek through mixed bitcast/insert_subvector/extract_subvector chains to find the source of the shuffle operand.
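For illustration, here is a minimal standalone sketch of the iterative peeling pattern the patch adopts: keep stripping free bitcasts, zero-index insert_subvector-into-undef widenings, and zero-index extract_subvector narrowings until none apply, instead of handling each wrapper kind only once. This is a simplified model, not LLVM's SelectionDAG API; the `Node` type and `peelFreeWrappers` helper are hypothetical, and it omits the mask-index-to-undef rewriting and root-size checks that the real combine performs.

```cpp
// Simplified, hypothetical model of "peek through free wrappers" (not LLVM's API).
#include <cassert>
#include <vector>

enum class Opc { Other, Bitcast, InsertSubvector, ExtractSubvector, Undef };

struct Node {
  Opc Opcode = Opc::Other;
  std::vector<Node *> Ops; // operands
  unsigned Index = 0;      // subvector index for insert/extract
};

// Repeatedly strip wrappers that move no data:
//  - bitcasts,
//  - insert_subvector of X into undef at index 0 (widening),
//  - extract_subvector of X at index 0 (narrowing).
// The loop restarts after each peel, so mixed chains are fully flattened.
static Node *peelFreeWrappers(Node *N) {
  while (true) {
    if (N->Opcode == Opc::Bitcast) {
      N = N->Ops[0];
      continue;
    }
    if (N->Opcode == Opc::InsertSubvector &&
        N->Ops[0]->Opcode == Opc::Undef && N->Index == 0) {
      N = N->Ops[1];
      continue;
    }
    if (N->Opcode == Opc::ExtractSubvector && N->Index == 0) {
      N = N->Ops[0];
      continue;
    }
    return N; // nothing left to peel
  }
}

int main() {
  // src -> extract_subvector(_, 0) -> bitcast -> insert_subvector(undef, _, 0)
  Node Src;
  Node Undef{Opc::Undef, {}, 0};
  Node Extract{Opc::ExtractSubvector, {&Src}, 0};
  Node Cast{Opc::Bitcast, {&Extract}, 0};
  Node Insert{Opc::InsertSubvector, {&Undef, &Cast}, 0};
  assert(peelFreeWrappers(&Insert) == &Src);
  return 0;
}
```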
---
Patch is 937.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134701.diff
11 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+29-22)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll (+5-8)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll (+190-190)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+2796-2782)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll (+44-44)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+600-594)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll (+448-468)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+970-970)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll (+111-363)
- (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+11-11)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bac5684733e60..d86eec1584274 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41115,30 +41115,37 @@ static SDValue combineX86ShufflesRecursively(
}
}
+ // Peek through any free bitcasts to insert_subvector vector widenings or
+ // extract_subvector nodes back to root size.
+ // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
for (auto [I, Op] : enumerate(Ops)) {
- // Peek through vector widenings + set out of bounds mask indices to undef.
- // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
- if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
- isNullConstant(Op.getOperand(2))) {
- Op = Op.getOperand(1);
- unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
- int Lo = I * Mask.size();
- int Hi = (I + 1) * Mask.size();
- int NewHi = Lo + (Mask.size() / Scale);
- for (int &M : Mask) {
- if (Lo <= M && NewHi <= M && M < Hi)
- M = SM_SentinelUndef;
- }
- }
-
- // Peek through any free bitcasts/extract_subvector nodes back to root size.
SDValue BC = Op;
- if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse())
- BC = peekThroughOneUseBitcasts(BC);
- while (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
- isNullConstant(BC.getOperand(1))) {
- Op = BC = BC.getOperand(0);
+ while (1) {
+ if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
+ BC = BC.getOperand(0);
+ continue;
+ }
+ if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
+ // Set out of bounds mask indices to undef.
+ Op = BC = BC.getOperand(1);
+ unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
+ int Lo = I * Mask.size();
+ int Hi = (I + 1) * Mask.size();
+ int NewHi = Lo + (Mask.size() / Scale);
+ for (int &M : Mask) {
+ if (Lo <= M && NewHi <= M && M < Hi)
+ M = SM_SentinelUndef;
+ }
+ continue;
+ }
+ if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
+ isNullConstant(BC.getOperand(1))) {
+ Op = BC = BC.getOperand(0);
+ continue;
+ }
+ break;
}
}
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index b075d48627b18..1fada58f05ba9 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -4708,18 +4708,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index a39bc6b668669..da902b3aed5ab 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -1836,7 +1836,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10
; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12
; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -1858,7 +1858,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -1914,7 +1914,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -1936,7 +1936,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -1992,7 +1992,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -2014,7 +2014,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -2070,7 +2070,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -2092,7 +2092,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index 25bad7578c111..ca8fcf2ee0f2c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -4230,91 +4230,91 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm4
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,2,1,8,9,8,9]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [8,8,0,9,0,1,0,1]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1
; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492
; AVX512-FCP-NEXT: kmovw %eax, %k2
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k2}
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k2}
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [16,9,10,17,12,13,18,15]
; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm10
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5
-; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm20, %zmm1
+; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm19, %ymm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm17
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8
+; AVX512-FCP-NEXT: vpermt2d %ymm6, %ymm19, %ymm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm17
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm9
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm4
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2}
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm2
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm2 {%k2}
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm19
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,10,9]
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm9
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm2
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm19, %zmm19
+; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm19
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,0,9]
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm8
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm0
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm19, %zmm19
; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm4
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm0
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm6
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm5
; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm6
-; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm7
+; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1}
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm8
-; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm3, %ymm8
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm14
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm3, %ymm14
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [8,9,20,11,12,21,14,15]
; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6
+; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7
; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,0,1,10,10,10,10]
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11]
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[8],ymm5[8],ymm14[9],ymm5[9],ymm14[10],ymm5[10],ymm14[11],ymm5[11]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5]...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/134701