[llvm] 8cefc37 - [DAGCombine] visitEXTRACT_SUBVECTOR - 'little to big' extract_subvector(bitcast()) support
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 23 07:21:49 PST 2019
Author: Sanjay Patel
Date: 2019-12-23T10:11:45-05:00
New Revision: 8cefc37be5aba4948936c7beb97cde7a68449f1f
URL: https://github.com/llvm/llvm-project/commit/8cefc37be5aba4948936c7beb97cde7a68449f1f
DIFF: https://github.com/llvm/llvm-project/commit/8cefc37be5aba4948936c7beb97cde7a68449f1f.diff
LOG: [DAGCombine] visitEXTRACT_SUBVECTOR - 'little to big' extract_subvector(bitcast()) support
This moves the X86-specific transform from rL364407
into DAGCombiner to generically handle 'little to big' cases
(for example: extract_subvector(v2i64 bitcast(v16i8))). This
allows us to remove both the X86 implementation and the AArch64
bitcast(extract_subvector(bitcast())) combine.
Earlier patches that dealt with regressions initially exposed
by this patch:
rG5e5e99c041e4
rG0b38af89e2c0
Patch by: @RKSimon (Simon Pilgrim)
Differential Revision: https://reviews.llvm.org/D63815
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/AArch64/merge-store.ll
llvm/test/CodeGen/ARM/combine-vmovdrr.ll
llvm/test/CodeGen/X86/avg-mask.ll
llvm/test/CodeGen/X86/madd.ll
llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1ae2f58415fa..02ae11d8a002 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18515,7 +18515,23 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getBitcast(NVT, NewExtract);
}
}
- // TODO - handle (DestNumElts % SrcNumElts) == 0
+ if ((DestNumElts % SrcNumElts) == 0) {
+ unsigned DestSrcRatio = DestNumElts / SrcNumElts;
+ if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) {
+ unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio;
+ EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
+ SrcVT.getScalarType(), NewExtNumElts);
+ if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+ unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
+ SDLoc DL(N);
+ SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+ V.getOperand(0), NewIndex);
+ return DAG.getBitcast(NVT, NewExtract);
+ }
+ }
+ }
}
// Combine:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9ae84f51407c..c5ea8e0f9fff 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -618,7 +618,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
- setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
if (Subtarget->supportsAddressTopByteIgnored())
@@ -10185,74 +10184,6 @@ static SDValue performSRLCombine(SDNode *N,
return SDValue();
}
-static SDValue performBitcastCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- // Wait 'til after everything is legalized to try this. That way we have
- // legal vector types and such.
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- // Remove extraneous bitcasts around an extract_subvector.
- // For example,
- // (v4i16 (bitconvert
- // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
- // becomes
- // (extract_subvector ((v8i16 ...), (i64 4)))
-
- // Only interested in 64-bit vectors as the ultimate result.
- EVT VT = N->getValueType(0);
- if (!VT.isVector() || VT.isScalableVector())
- return SDValue();
- if (VT.getSimpleVT().getSizeInBits() != 64)
- return SDValue();
- // Is the operand an extract_subvector starting at the beginning or halfway
- // point of the vector? A low half may also come through as an
- // EXTRACT_SUBREG, so look for that, too.
- SDValue Op0 = N->getOperand(0);
- if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
- !(Op0->isMachineOpcode() &&
- Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
- return SDValue();
- uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
- if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
- return SDValue();
- } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
- if (idx != AArch64::dsub)
- return SDValue();
- // The dsub reference is equivalent to a lane zero subvector reference.
- idx = 0;
- }
- // Look through the bitcast of the input to the extract.
- if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
- return SDValue();
- SDValue Source = Op0->getOperand(0)->getOperand(0);
- // If the source type has twice the number of elements as our destination
- // type, we know this is an extract of the high or low half of the vector.
- EVT SVT = Source->getValueType(0);
- if (!SVT.isVector() ||
- SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
- return SDValue();
-
- LLVM_DEBUG(
- dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
-
- // Create the simplified form to just extract the low or high half of the
- // vector directly rather than bothering with the bitcasts.
- SDLoc dl(N);
- unsigned NumElements = VT.getVectorNumElements();
- if (idx) {
- SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
- } else {
- SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
- return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
- Source, SubReg),
- 0);
- }
-}
-
static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -12453,8 +12384,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performExtendCombine(N, DCI, DAG);
case ISD::SIGN_EXTEND_INREG:
return performSignExtendInRegCombine(N, DCI, DAG);
- case ISD::BITCAST:
- return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
case ISD::SELECT:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 54a64052ebb6..cd6821c16e8b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45103,7 +45103,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue InVec = N->getOperand(0);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
- EVT InVecBCVT = InVecBC.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
@@ -45147,31 +45146,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
- // Try to move vector bitcast after extract_subv by scaling extraction index:
- // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
- // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
- if (InVec != InVecBC && InVecBCVT.isVector()) {
- unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
- unsigned DestNumElts = InVecVT.getVectorNumElements();
- if ((DestNumElts % SrcNumElts) == 0) {
- unsigned DestSrcRatio = DestNumElts / SrcNumElts;
- if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
- unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
- EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
- InVecBCVT.getScalarType(), NewExtNumElts);
- if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
- TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
- unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
- SDLoc DL(N);
- SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
- SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
- InVecBC, NewIndex);
- return DAG.getBitcast(VT, NewExtract);
- }
- }
- }
- }
-
// If we are extracting from an insert into a zero vector, replace with a
// smaller insert into zero if we don't access less than the original
// subvector. Don't do this for i1 vectors.
diff --git a/llvm/test/CodeGen/AArch64/merge-store.ll b/llvm/test/CodeGen/AArch64/merge-store.ll
index cd9564210e9a..f0a53384cdd8 100644
--- a/llvm/test/CodeGen/AArch64/merge-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-store.ll
@@ -42,17 +42,10 @@ define void @blam() {
; the fastness of unaligned accesses was not specified correctly.
define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
-; SPLITTING-LABEL: merge_vec_extract_stores:
-; SPLITTING: // %bb.0:
-; SPLITTING-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; SPLITTING-NEXT: str d0, [x0, #24]
-; SPLITTING-NEXT: str d1, [x0, #32]
-; SPLITTING-NEXT: ret
-;
-; MISALIGNED-LABEL: merge_vec_extract_stores:
-; MISALIGNED: // %bb.0:
-; MISALIGNED-NEXT: stur q0, [x0, #24]
-; MISALIGNED-NEXT: ret
+; CHECK-LABEL: merge_vec_extract_stores:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stur q0, [x0, #24]
+; CHECK-NEXT: ret
%idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
%idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4
@@ -62,9 +55,4 @@ define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
ret void
-
-
-; FIXME: Ideally we would like to use a generic target for this test, but this relies
-; on suppressing store pairs.
-
}
diff --git a/llvm/test/CodeGen/ARM/combine-vmovdrr.ll b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll
index 358f7e3a983e..01526b371990 100644
--- a/llvm/test/CodeGen/ARM/combine-vmovdrr.ll
+++ b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll
@@ -9,8 +9,8 @@ declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %shuffle.i.i307, <8 x i8> %shuffl
; they are defined on VPRs and used on VPRs.
;
; CHECK-LABEL: motivatingExample:
-; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1]
-; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0]
+; CHECK: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0]
+; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1]
; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]]
; CHECK-NEXT: vstr [[RES]], [r1]
; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/X86/avg-mask.ll b/llvm/test/CodeGen/X86/avg-mask.ll
index a7ce07ab0cd9..ebe9e75ab4f3 100644
--- a/llvm/test/CodeGen/X86/avg-mask.ll
+++ b/llvm/test/CodeGen/X86/avg-mask.ll
@@ -130,9 +130,9 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
; AVX512F-NEXT: shrq $32, %rdi
; AVX512F-NEXT: shrq $48, %rax
; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpavgb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
@@ -142,14 +142,14 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
-; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
@@ -178,9 +178,9 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
; AVX512F-NEXT: shrq $32, %rdi
; AVX512F-NEXT: shrq $48, %rax
; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
@@ -190,14 +190,14 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
@@ -330,18 +330,18 @@ define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpavgw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
@@ -366,18 +366,18 @@ define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nou
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %edi, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 62792ec074ae..11756574217e 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1975,9 +1975,9 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
;
; AVX512F-LABEL: pmaddwd_32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
@@ -2188,9 +2188,9 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
;
; AVX512F-LABEL: jumbled_indices16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index cc73f563581a..b79fb5c35f3d 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -6374,9 +6374,9 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
;
; AVX512F-LABEL: truncstore_v32i16_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vpmovmskb %ymm1, %eax
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 720cabee9122..7f00b49b81f8 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -725,33 +725,33 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
More information about the llvm-commits
mailing list