[llvm] 3c24326 - [X86] Remove combineVectorSignBitsTruncation and leave TRUNCATE -> PACKSS/PACKUS to legalization/lowering
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 17 04:23:50 PDT 2023
Author: Simon Pilgrim
Date: 2023-08-17T12:23:29+01:00
New Revision: 3c2432690ad2bcc7a4d186f94dad2efcdee55cef
URL: https://github.com/llvm/llvm-project/commit/3c2432690ad2bcc7a4d186f94dad2efcdee55cef
DIFF: https://github.com/llvm/llvm-project/commit/3c2432690ad2bcc7a4d186f94dad2efcdee55cef.diff
LOG: [X86] Remove combineVectorSignBitsTruncation and leave TRUNCATE -> PACKSS/PACKUS to legalization/lowering
Don't prematurely fold TRUNCATE nodes to PACKSS/PACKUS target nodes - once converted, we miss out on generic TRUNCATE folds.
Helps with some regressions from D152928 and #63946
Fixes #63710
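As a rough illustration (not taken from the patch itself), consider a vector truncate whose input has extended sign bits:

; hypothetical example, not one of the tests changed below
define <8 x i16> @trunc_ashr_v8i32_v8i16(<8 x i32> %a0) {
  %s = ashr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %t = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %t
}

Before this patch, combineVectorSignBitsTruncation rewrote such TRUNCATE nodes into X86ISD::PACKSS/PACKUS during DAG combining, hiding them from generic TRUNCATE folds. After it, the generic node survives until lowering, where LowerTruncateVecPackWithSignBits still produces the same PACK sequences (e.g. psrad $16 + packssdw on SSE2, as seen in the pmulh.ll diff below).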
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/movmsk-cmp.ll
llvm/test/CodeGen/X86/pmulh.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/sext-vsetcc.ll
llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
llvm/test/CodeGen/X86/vector-trunc-packus.ll
llvm/test/CodeGen/X86/vector-trunc-ssat.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
llvm/test/CodeGen/X86/vselect-avx.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4ff97905fe00ee..7b2d1be1e418e5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20014,12 +20014,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
}
// Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
+ // On pre-AVX512, pack the src in both halves to help value tracking.
if (SrcSizeInBits <= 128) {
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
- In = DAG.getBitcast(InVT, In);
- SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
+ SDValue LHS = DAG.getBitcast(InVT, In);
+ SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
Res = DAG.getBitcast(PackedVT, Res);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
@@ -50844,46 +50846,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// This function transforms vector truncation of 'extended sign-bits' or
-/// 'extended zero-bits' values.
-/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
-/// TODO: Remove this and just use LowerTruncateVecPackWithSignBits.
-static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- // Requires SSE2.
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
- return SDValue();
-
- SDValue In = N->getOperand(0);
- if (!In.getValueType().isSimple())
- return SDValue();
-
- MVT VT = N->getValueType(0).getSimpleVT();
- MVT InVT = In.getValueType().getSimpleVT();
-
- // AVX512 has fast truncate, but if the input is already going to be split,
- // there's no harm in trying pack.
- if (Subtarget.hasAVX512() &&
- !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
- InVT.is512BitVector())) {
- // PACK should still be worth it for 128-bit vectors if the sources were
- // originally concatenated from subvectors.
- if (VT.getSizeInBits() > 128 || !isFreeToSplitVector(In.getNode(), DAG))
- return SDValue();
- }
-
- unsigned PackOpcode;
- if (SDValue Src =
- matchTruncateWithPACK(PackOpcode, VT, In, DL, DAG, Subtarget))
- return truncateVectorWithPACK(PackOpcode, VT, Src, DL, DAG, Subtarget);
-
- return SDValue();
-}
-
// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// TODO: This is X86 specific because we want to be able to handle wide types
@@ -51140,10 +51102,6 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
- // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
- if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
- return V;
-
return SDValue();
}
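The truncateVectorWithPACK change above also adjusts sub-128-bit truncates on pre-AVX512 targets: packing the source into both halves of the 128-bit result (RHS = LHS instead of undef) leaves the upper elements with known values that later value tracking can use. A minimal sketch of the kind of input that hits this path, mirroring the trunc_usat_v4i32_v4i8 test updated below (the names and constants here are illustrative):

define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
  %c = icmp ult <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
  %m = select <4 x i1> %c, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
  %t = trunc <4 x i32> %m to <4 x i8>
  ret <4 x i8> %t
}

The duplicated shuffle masks in the vector-trunc-*.ll diffs (xmm0[0,4,8,12,0,4,8,12,...] rather than xmm0[0,4,8,12,u,...]) are consistent with the upper lanes now being defined.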
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 95db89049b05a7..88d819d6106ea9 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -422,25 +422,15 @@ define i1 @allzeros_v16i16_sign(<16 x i16> %arg) {
}
define i1 @allones_v32i16_sign(<32 x i16> %arg) {
-; SSE2-LABEL: allones_v32i16_sign:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; SSE2-NEXT: sete %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: allones_v32i16_sign:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsw %xmm3, %xmm1
-; SSE41-NEXT: pmaxsw %xmm2, %xmm0
-; SSE41-NEXT: packsswb %xmm1, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; SSE41-NEXT: sete %al
-; SSE41-NEXT: retq
+; SSE-LABEL: allones_v32i16_sign:
+; SSE: # %bb.0:
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
;
; AVX1-LABEL: allones_v32i16_sign:
; AVX1: # %bb.0:
@@ -496,25 +486,15 @@ define i1 @allones_v32i16_sign(<32 x i16> %arg) {
}
define i1 @allzeros_v32i16_sign(<32 x i16> %arg) {
-; SSE2-LABEL: allzeros_v32i16_sign:
-; SSE2: # %bb.0:
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: sete %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: allzeros_v32i16_sign:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsw %xmm3, %xmm1
-; SSE41-NEXT: pminsw %xmm2, %xmm0
-; SSE41-NEXT: packsswb %xmm1, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: sete %al
-; SSE41-NEXT: retq
+; SSE-LABEL: allzeros_v32i16_sign:
+; SSE: # %bb.0:
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: testl %eax, %eax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
;
; AVX1-LABEL: allzeros_v32i16_sign:
; AVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index bd01c2f22ef07f..1110146d3cda8c 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -310,41 +310,41 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: and_mulhuw_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm5
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: packssdw %xmm5, %xmm4
-; SSE2-NEXT: pmulhw %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm7
-; SSE2-NEXT: pand %xmm6, %xmm8
-; SSE2-NEXT: packssdw %xmm7, %xmm8
-; SSE2-NEXT: pmulhw %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: packssdw %xmm7, %xmm6
+; SSE2-NEXT: pmulhw %xmm2, %xmm6
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm8
+; SSE2-NEXT: packssdw %xmm5, %xmm8
+; SSE2-NEXT: pmulhw %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: and_mulhuw_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
-; SSE41-NEXT: pand %xmm8, %xmm1
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: packusdw %xmm5, %xmm4
-; SSE41-NEXT: pmulhw %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm1
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm7
-; SSE41-NEXT: pand %xmm6, %xmm8
-; SSE41-NEXT: packusdw %xmm7, %xmm8
-; SSE41-NEXT: pmulhw %xmm2, %xmm8
-; SSE41-NEXT: movdqa %xmm8, %xmm1
+; SSE41-NEXT: pand %xmm8, %xmm6
+; SSE41-NEXT: packusdw %xmm7, %xmm6
+; SSE41-NEXT: pmulhw %xmm2, %xmm6
+; SSE41-NEXT: pand %xmm8, %xmm5
+; SSE41-NEXT: pand %xmm4, %xmm8
+; SSE41-NEXT: packusdw %xmm5, %xmm8
+; SSE41-NEXT: pmulhw %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: and_mulhuw_v16i16:
@@ -408,13 +408,6 @@ define <16 x i16> @sext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: ashr_mulhuw_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: packssdw %xmm7, %xmm6
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: pmulhw %xmm6, %xmm2
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
@@ -422,25 +415,32 @@ define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pmulhw %xmm4, %xmm0
+; SSE2-NEXT: psrad $16, %xmm7
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: packssdw %xmm7, %xmm6
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: pmulhw %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: ashr_mulhuw_v16i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: psrld $16, %xmm5
-; SSE41-NEXT: psrld $16, %xmm4
-; SSE41-NEXT: packusdw %xmm5, %xmm4
-; SSE41-NEXT: pmulhw %xmm4, %xmm0
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm7
; SSE41-NEXT: psrld $16, %xmm6
; SSE41-NEXT: packusdw %xmm7, %xmm6
; SSE41-NEXT: pmulhw %xmm2, %xmm6
+; SSE41-NEXT: psrld $16, %xmm5
+; SSE41-NEXT: psrld $16, %xmm4
+; SSE41-NEXT: packusdw %xmm5, %xmm4
+; SSE41-NEXT: pmulhw %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm6, %xmm1
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 430e1ce7289011..7858d125b9da49 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1872,13 +1872,13 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
; SSE41-LABEL: psubus_16i32_max:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
-; SSE41-NEXT: pminud %xmm6, %xmm5
-; SSE41-NEXT: pminud %xmm6, %xmm4
-; SSE41-NEXT: packusdw %xmm5, %xmm4
; SSE41-NEXT: pminud %xmm6, %xmm3
; SSE41-NEXT: pminud %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psubusw %xmm2, %xmm0
+; SSE41-NEXT: pminud %xmm6, %xmm5
+; SSE41-NEXT: pminud %xmm6, %xmm4
+; SSE41-NEXT: packusdw %xmm5, %xmm4
; SSE41-NEXT: psubusw %xmm4, %xmm1
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll
index f42bac9eacca85..ae0b010a1e5943 100644
--- a/llvm/test/CodeGen/X86/sext-vsetcc.ll
+++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll
@@ -638,32 +638,17 @@ define <8 x i32> @PR63946(<8 x i32> %a0, <8 x i32> %b0) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm3, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm4, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm9
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm5
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm10
-; AVX2-NEXT: vpor %xmm3, %xmm10, %xmm3
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm6, %ymm6
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm10
-; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm7
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm10
-; AVX2-NEXT: vpackssdw %xmm10, %xmm7, %xmm7
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm4, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm4
+; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm6, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8
-; AVX2-NEXT: vpackssdw %xmm8, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm0, %xmm7, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
-; AVX2-NEXT: vpor %xmm3, %xmm7, %xmm3
-; AVX2-NEXT: vpor %xmm3, %xmm9, %xmm3
-; AVX2-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm2
-; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index 8b70daee1d60bc..565946d342e935 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -319,13 +319,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: vpextrb $8, %xmm0, %edx
-; AVX1-NEXT: vpextrb $0, %xmm1, %ecx
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpextrb $4, %xmm0, %edx
+; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: # kill: def $dl killed $dl killed $edx
; AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -375,11 +375,12 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: vpextrb $8, %xmm0, %edx
-; AVX2-NEXT: vpextrb $0, %xmm1, %ecx
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: notl %eax
+; AVX2-NEXT: vpextrb $8, %xmm1, %edx
+; AVX2-NEXT: vpextrb $0, %xmm2, %ecx
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: # kill: def $dl killed $dl killed $edx
; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 22bb9dacc7c037..49cd4d20d166a7 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -1211,9 +1211,8 @@ define i8 @icmp0_v8i1(<8 x i8>) nounwind {
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
-; AVX1OR2-NEXT: testl %eax, %eax
+; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX1OR2-NEXT: sete %al
; AVX1OR2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index d6b200a1e268a6..a94104a002d5ce 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -4092,7 +4092,7 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"
; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i32_v4i8:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 1f8572f6c1b897..fbf9187df4817e 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -3827,7 +3827,7 @@ define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) {
; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v4i32_v4i8:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 12803e2df57192..a0909e370097d2 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -2940,7 +2940,7 @@ define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
; AVX2-FAST-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v4i32_v4i8:
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 6ba205765490d3..5040232fadad25 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -101,9 +101,6 @@ define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %t
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vmovq %xmm1, (%rsi)
; AVX1-NEXT: retq
@@ -119,9 +116,6 @@ define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %t
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vmovq %xmm1, (%rsi)
; AVX2-NEXT: retq