[llvm] 84e46aa - [X86] combineConcatVectorOps - add handling to concat setcc instructions together (#170295)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 06:50:55 PST 2025
Author: Simon Pilgrim
Date: 2025-12-02T14:50:51Z
New Revision: 84e46aa62d66fab59c0b3beee7b4b154d62eeb0f
URL: https://github.com/llvm/llvm-project/commit/84e46aa62d66fab59c0b3beee7b4b154d62eeb0f
DIFF: https://github.com/llvm/llvm-project/commit/84e46aa62d66fab59c0b3beee7b4b154d62eeb0f.diff
LOG: [X86] combineConcatVectorOps - add handling to concat setcc instructions together (#170295)
So far this only handles AVX512 predicate masks, which are by far the
easiest to support. AVX1/AVX2 support can mostly be handled via CMPP +
CMPEQ/GT nodes, but these still fail for some icmp expansions where
nodes have multiple uses.
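In DAG terms, the new fold is roughly (an informal sketch, not compiler
output):

  concat_vectors(setcc(x0, y0, cc), setcc(x1, y1, cc))
    --> setcc(concat_vectors(x0, x1), concat_vectors(y0, y1), cc)

For example, two v4i32 equality compares feeding a v8i1 mask (see
concat_icmp_v8i32_v4i32 below) now select to a single ymm vptestnmd
instead of two xmm compares stitched together with kshiftlb/korb mask
arithmetic.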
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
llvm/test/CodeGen/X86/combine-icmp.ll
llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 427c18a4bb576..9da121dd9ab87 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59427,6 +59427,31 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
break;
+ case ISD::SETCC:
+ if (!IsSplat && EltSizeInBits == 1 &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(0).getValueType() ==
+ Op.getOperand(0).getValueType() &&
+ Op0.getOperand(2) == Op.getOperand(2);
+ })) {
+ EVT SrcVT = Op0.getOperand(0).getValueType();
+ EVT NewSrcVT = EVT::getVectorVT(Ctx, SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ unsigned SrcSizeInBits = SrcVT.getScalarSizeInBits();
+ if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(NewSrcVT) &&
+ (NewSrcVT.is256BitVector() ||
+ (NewSrcVT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ (SrcSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
+ SDValue LHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 0);
+ SDValue RHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 1);
+ if (LHS || RHS)
+ return DAG.getNode(Opcode, DL, VT,
+ LHS ? LHS : ConcatSubOperand(NewSrcVT, Ops, 0),
+ RHS ? RHS : ConcatSubOperand(NewSrcVT, Ops, 1),
+ Op0.getOperand(2));
+ }
+ }
+ break;
case ISD::CTPOP:
case ISD::CTTZ:
case ISD::CTLZ:
@@ -59791,13 +59816,16 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
}
}
- // Attempt to merge logic ops if the type is legal.
- if (TLI.isTypeLegal(VT) && all_of(Ops, [](SDValue Op) {
- return ISD::isBitwiseLogicOp(Op.getOpcode());
- }))
+ // Attempt to merge comparison/logic ops if the type is legal.
+ if (TLI.isTypeLegal(VT) &&
+ (all_of(Ops, [](SDValue Op) { return Op.getOpcode() == ISD::SETCC; }) ||
+ all_of(Ops, [](SDValue Op) {
+ return ISD::isBitwiseLogicOp(Op.getOpcode());
+ }))) {
if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops,
DAG, Subtarget))
return R;
+ }
// Don't do anything else for i1 vectors.
return SDValue();
diff --git a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index a24c1d8c2fcc4..7fb20418aeda4 100644
--- a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -52,13 +52,12 @@ define <8 x i1> @test3(<4 x i1> %a) {
define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT: vpmovd2m %xmm1, %k0
-; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k1
-; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: korb %k0, %k1, %k0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
+; CHECK-NEXT: vpmovd2m %ymm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -68,13 +67,12 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
-; CHECK-NEXT: vpmovq2m %xmm1, %k0
-; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k1
-; CHECK-NEXT: kshiftlb $2, %k0, %k0
-; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsllq $63, %ymm0, %ymm0
+; CHECK-NEXT: vpmovq2m %ymm0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/combine-icmp.ll b/llvm/test/CodeGen/X86/combine-icmp.ll
index 603917b52cd5f..dba583905c2c5 100644
--- a/llvm/test/CodeGen/X86/combine-icmp.ll
+++ b/llvm/test/CodeGen/X86/combine-icmp.ll
@@ -83,12 +83,12 @@ define i8 @concat_icmp_v8i32_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
;
; AVX512-LABEL: concat_icmp_v8i32_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kshiftlb $4, %k1, %k1
-; AVX512-NEXT: korb %k1, %k0, %k0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%v0 = icmp eq <4 x i32> %a0, zeroinitializer
%v1 = icmp eq <4 x i32> %a1, zeroinitializer
@@ -151,12 +151,12 @@ define i16 @concat_icmp_v16i16_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
;
; AVX512-LABEL: concat_icmp_v16i16_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; AVX512-NEXT: vpcmpnleuw %xmm2, %xmm0, %k0
-; AVX512-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
-; AVX512-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%v0 = icmp ugt <8 x i16> %a0, splat (i16 1)
%v1 = icmp ugt <8 x i16> %a1, splat (i16 1)
@@ -199,11 +199,11 @@ define i32 @concat_icmp_v32i8_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
;
; AVX512-LABEL: concat_icmp_v32i8_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
-; AVX512-NEXT: vpcmpgtb %xmm2, %xmm0, %k0
-; AVX512-NEXT: vpcmpgtb %xmm2, %xmm1, %k1
-; AVX512-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%v0 = icmp sgt <16 x i8> %a0, splat (i8 5)
%v1 = icmp sgt <16 x i8> %a1, splat (i8 5)
@@ -329,21 +329,15 @@ define i8 @concat_icmp_v8i64_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2,
;
; AVX512-LABEL: concat_icmp_v8i64_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm4 = [128,128]
-; AVX512-NEXT: vpcmpltuq %xmm4, %xmm0, %k0
-; AVX512-NEXT: vpcmpltuq %xmm4, %xmm1, %k1
-; AVX512-NEXT: vpcmpltuq %xmm4, %xmm2, %k2
-; AVX512-NEXT: vpcmpltuq %xmm4, %xmm3, %k3
-; AVX512-NEXT: kshiftlb $2, %k3, %k3
-; AVX512-NEXT: korb %k3, %k2, %k2
-; AVX512-NEXT: kshiftlb $4, %k2, %k2
-; AVX512-NEXT: kshiftlb $2, %k1, %k1
-; AVX512-NEXT: korw %k1, %k0, %k0
-; AVX512-NEXT: kshiftlb $4, %k0, %k0
-; AVX512-NEXT: kshiftrb $4, %k0, %k0
-; AVX512-NEXT: korb %k2, %k0, %k0
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%v0 = icmp ult <2 x i64> %a0, splat (i64 128)
%v1 = icmp ult <2 x i64> %a1, splat (i64 128)
@@ -387,18 +381,16 @@ define i16 @concat_icmp_v16i32_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2
;
; AVX512-LABEL: concat_icmp_v16i32_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512-NEXT: vpcmpgtd %xmm4, %xmm0, %k0
-; AVX512-NEXT: vpcmpgtd %xmm4, %xmm1, %k1
-; AVX512-NEXT: vpcmpgtd %xmm4, %xmm2, %k2
-; AVX512-NEXT: vpcmpgtd %xmm4, %xmm3, %k3
-; AVX512-NEXT: kshiftlb $4, %k1, %k1
-; AVX512-NEXT: korb %k1, %k0, %k0
-; AVX512-NEXT: kshiftlb $4, %k3, %k1
-; AVX512-NEXT: korb %k1, %k2, %k1
-; AVX512-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%v0 = icmp sgt <4 x i32> %a0, zeroinitializer
%v1 = icmp sgt <4 x i32> %a1, zeroinitializer
@@ -468,14 +460,14 @@ define i32 @concat_icmp_v32i16_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2
;
; AVX512-LABEL: concat_icmp_v32i16_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vptestmw %xmm0, %xmm0, %k0
-; AVX512-NEXT: vptestmw %xmm1, %xmm1, %k1
-; AVX512-NEXT: vptestmw %xmm2, %xmm2, %k2
-; AVX512-NEXT: vptestmw %xmm3, %xmm3, %k3
-; AVX512-NEXT: kunpckbw %k0, %k1, %k0
-; AVX512-NEXT: kunpckbw %k2, %k3, %k1
-; AVX512-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vptestmw %zmm0, %zmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%v0 = icmp ne <8 x i16> %a0, zeroinitializer
%v1 = icmp ne <8 x i16> %a1, zeroinitializer
@@ -560,15 +552,14 @@ define i64 @concat_icmp_v64i8_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2,
;
; AVX512-LABEL: concat_icmp_v64i8_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512-NEXT: vpcmpnleub %xmm4, %xmm0, %k0
-; AVX512-NEXT: vpcmpnleub %xmm4, %xmm1, %k1
-; AVX512-NEXT: vpcmpnleub %xmm4, %xmm2, %k2
-; AVX512-NEXT: vpcmpnleub %xmm4, %xmm3, %k3
-; AVX512-NEXT: kunpckwd %k0, %k1, %k0
-; AVX512-NEXT: kunpckwd %k2, %k3, %k1
-; AVX512-NEXT: kunpckdq %k0, %k1, %k0
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
; AVX512-NEXT: kmovq %k0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%v0 = icmp ugt <16 x i8> %a0, splat (i8 15)
%v1 = icmp ugt <16 x i8> %a1, splat (i8 15)
@@ -672,10 +663,9 @@ define i8 @concat_icmp_v8i64_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
;
; AVX512-LABEL: concat_icmp_v8i64_v4i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vptestnmq %ymm0, %ymm0, %k0
-; AVX512-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; AVX512-NEXT: kshiftlb $4, %k1, %k1
-; AVX512-NEXT: korb %k1, %k0, %k0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
@@ -768,10 +758,9 @@ define i16 @concat_icmp_v16i32_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
;
; AVX512-LABEL: concat_icmp_v16i32_v8i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
-; AVX512-NEXT: vpcmpnleud %ymm2, %ymm0, %k0
-; AVX512-NEXT: vpcmpnleud %ymm2, %ymm1, %k1
-; AVX512-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
@@ -830,10 +819,9 @@ define i32 @concat_icmp_v32i16_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
;
; AVX512-LABEL: concat_icmp_v32i16_v16i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
-; AVX512-NEXT: vpcmpgtw %ymm2, %ymm0, %k0
-; AVX512-NEXT: vpcmpgtw %ymm2, %ymm1, %k1
-; AVX512-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -903,10 +891,9 @@ define i64 @concat_icmp_v64i8_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
;
; AVX512-LABEL: concat_icmp_v64i8_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT: vpcmpgtb %ymm0, %ymm2, %k0
-; AVX512-NEXT: vpcmpgtb %ymm1, %ymm2, %k1
-; AVX512-NEXT: kunpckdq %k0, %k1, %k0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpltb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
; AVX512-NEXT: kmovq %k0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index ad08eaffab383..7e00d679d56b2 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -43,25 +43,23 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_sext_v16i8:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
; AVX512VL-LABEL: testv16i1_sext_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -70,10 +68,8 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_sext_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -91,13 +87,13 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_sext_v16i16:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX256-NEXT: retq
@@ -105,10 +101,8 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
; AVX512VL-LABEL: testv16i1_sext_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: retq
@@ -116,10 +110,8 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_sext_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: retq
@@ -173,27 +165,25 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_zext_v16i8:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
; AVX256-NEXT: vpsrlw $15, %xmm1, %xmm1
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
; AVX512VL-LABEL: testv16i1_zext_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -202,10 +192,8 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_zext_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -223,13 +211,13 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_zext_v16i16:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX256-NEXT: vpsrlw $15, %ymm0, %ymm0
@@ -238,10 +226,8 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
; AVX512VL-LABEL: testv16i1_zext_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vpsrlw $15, %ymm0, %ymm0
@@ -250,10 +236,8 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_zext_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 3699c7f75c861..93384341e03a4 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -18,26 +18,23 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) {
; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
-; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z}
-; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
-; AVX256VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
-; AVX256VL-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1]
-; AVX256VL-NEXT: vpmovsxwd %xmm3, %ymm3
-; AVX256VL-NEXT: vpslld $31, %ymm3, %ymm3
-; AVX256VL-NEXT: vptestmd %ymm3, %ymm3, %k1
-; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; AVX256VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,12,13,2,3,u,u,6,7,u,u,14,15,0,1]
-; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7]
-; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0
-; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0
-; AVX256VL-NEXT: kshiftrw $8, %k0, %k2
-; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
+; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm3 {%k1} {z}
+; AVX256VL-NEXT: vpmovdw %ymm3, %xmm3
+; AVX256VL-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[6,7,12,13,2,3,u,u,6,7,u,u,14,15,0,1]
+; AVX256VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4],xmm2[5],xmm4[6,7]
+; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7]
+; AVX256VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1]
+; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX256VL-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX256VL-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX256VL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT: kshiftrw $8, %k1, %k1
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
; AVX256VL-NEXT: vzeroupper
; AVX256VL-NEXT: retq
;
@@ -135,14 +132,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2
-; AVX256VL-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX256VL-NEXT: vptestmd %ymm0, %ymm0, %k3
+; AVX256VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX256VL-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k3} {z}
+; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT: kshiftrw $8, %k2, %k2
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
@@ -153,20 +148,15 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
; AVX256VL-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm1
-; AVX256VL-NEXT: vpmovsxwd %xmm2, %ymm1
-; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX256VL-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0
-; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0
-; AVX256VL-NEXT: kshiftrw $8, %k0, %k2
-; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256VL-NEXT: vpmovsxwd %ymm2, %zmm1
+; AVX256VL-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX256VL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT: kshiftrw $8, %k1, %k1
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
; AVX256VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX256VL-NEXT: retq
;