[llvm] ef46046 - [X86] combineConcatVectorOps - add handling for X86ISD::VPERM2X128 nodes.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 11 02:01:29 PDT 2023
Author: Simon Pilgrim
Date: 2023-08-11T10:01:13+01:00
New Revision: ef46046060352745504877b3b07bca1305a3eb58
URL: https://github.com/llvm/llvm-project/commit/ef46046060352745504877b3b07bca1305a3eb58
DIFF: https://github.com/llvm/llvm-project/commit/ef46046060352745504877b3b07bca1305a3eb58.diff
LOG: [X86] combineConcatVectorOps - add handling for X86ISD::VPERM2X128 nodes.
On AVX512 targets we can concatenate these and create an X86ISD::SHUF128 node.
Prevents regressions in some future work to improve codegen for concat_vectors(extract_subvector(),extract_subvector()) patterns (mainly via vector widening).
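The fold maps the two 2-bit lane selectors of each VPERM2X128 immediate directly onto the four 2-bit fields of a SHUF128 immediate, since a selector value n picks 128-bit lane n of the concatenated operand pair. A minimal standalone sketch of that mapping is below (not taken from the commit; the helper name combineVPerm2X128Imms is made up for illustration, and zeroed lanes are assumed absent, mirroring the (Imm & 0x88) == 0 guard in the new code):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical sketch: fold the immediates of two VPERM2X128 nodes into a
// single SHUF128 immediate, assuming neither immediate zeroes a lane
// (bits 3 and 7 clear).
//
// A VPERM2X128 2-bit selector n picks 128-bit lane n of concat(Op0, Op1);
// after concatenating both nodes' operands into 512-bit inputs, the same
// selector values index lanes of those wide inputs, which is exactly what
// SHUF128 consumes (two bits per 128-bit result lane).
static uint8_t combineVPerm2X128Imms(uint8_t Imm0, uint8_t Imm1) {
  assert((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0 && "zeroed lanes unsupported");
  uint8_t Lane0 = Imm0 & 0x3;        // result lane 0, taken from LHS concat
  uint8_t Lane1 = (Imm0 >> 4) & 0x3; // result lane 1, taken from LHS concat
  uint8_t Lane2 = Imm1 & 0x3;        // result lane 2, taken from RHS concat
  uint8_t Lane3 = (Imm1 >> 4) & 0x3; // result lane 3, taken from RHS concat
  return Lane0 | (Lane1 << 2) | (Lane2 << 4) | (Lane3 << 6);
}

int main() {
  // Both halves take the high 128-bit lane of each source (vperm2i128
  // immediate 0x31), as in the updated tests: the combined SHUF128
  // immediate is 0xDD, i.e. lanes {1,3,1,3} = 64-bit elements [2,3,6,7].
  printf("0x%02X\n", (unsigned)combineVPerm2X128Imms(0x31, 0x31));
  return 0;
}

With this mapping, the pair of vperm2i128 {imm 0x31} instructions in the tests below become a single vshufi64x2 with immediate 0xDD, which the asm printer displays as zmm1[2,3,6,7],zmm0[2,3,6,7].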
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4469aada96716b..f1fc49a0b65c31 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54594,6 +54594,29 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
break;
+ case X86ISD::VPERM2X128: {
+ if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
+ assert(NumOps == 2 && "Bad concat_vectors operands");
+ unsigned Imm0 = Ops[0].getConstantOperandVal(2);
+ unsigned Imm1 = Ops[1].getConstantOperandVal(2);
+ // TODO: Handle zero'd subvectors.
+ if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
+ int Mask[4] = {(Imm0 & 0x03), ((Imm0 >> 4) & 0x3), (Imm1 & 0x03),
+ ((Imm1 >> 4) & 0x3)};
+ MVT ShuffleVT = EltSizeInBits >= 32 ? VT : MVT::v8i64;
+ SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
+ Ops[0].getOperand(1), DAG, DL);
+ SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
+ Ops[1].getOperand(1), DAG, DL);
+ SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
+ DAG.getBitcast(ShuffleVT, LHS),
+ DAG.getBitcast(ShuffleVT, RHS),
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ break;
+ }
case X86ISD::SHUF128: {
if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
unsigned Imm0 = Ops[0].getConstantOperandVal(2);
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 799c11d7c7b4a5..24b4dd785bfcf4 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -313,10 +313,10 @@ define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32
; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7]
; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi)
; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT: vzeroupper
@@ -1601,22 +1601,22 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm1[2,3]
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm0[2,3]
-; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm3
-; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm8
-; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm4
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm9
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm6
+; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm7
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm8
+; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
-; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm4
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi)
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7]
; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdi)
+; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi)
; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdi)
; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT: vzeroupper