[llvm] 6cf8bde - [X86] getFauxShuffleMask - add SIGN_EXTEND_VECTOR_INREG handling for all-signbits sources
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 19 06:32:50 PDT 2023
Author: Simon Pilgrim
Date: 2023-07-19T14:32:34+01:00
New Revision: 6cf8bde056f3ff58bde54dbe3b320d81ad5d66d6
URL: https://github.com/llvm/llvm-project/commit/6cf8bde056f3ff58bde54dbe3b320d81ad5d66d6
DIFF: https://github.com/llvm/llvm-project/commit/6cf8bde056f3ff58bde54dbe3b320d81ad5d66d6.diff
LOG: [X86] getFauxShuffleMask - add SIGN_EXTEND_VECTOR_INREG handling for all-signbits sources
Add support for shuffle combines (via combineEXTEND_VECTOR_INREG) to begin from SIGN_EXTEND_VECTOR_INREG nodes
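A minimal sketch of the kind of IR this helps (hypothetical example, not one of the commit's test cases; the function name is illustrative). When the extension source is already all-signbits per element, e.g. a vector compare result, the SIGN_EXTEND_VECTOR_INREG can be modelled by getFauxShuffleMask as an element-duplicating shuffle, which combineX86ShufflesRecursively can then fold instead of keeping a separate pmovsx:

; Each lane of %c sign-extends to an all-ones/all-zeros element, so the
; in-register sign extension only duplicates the narrow source elements.
define <4 x i64> @sext_all_signbits(<4 x i32> %a, <4 x i32> %b) {
  %c = icmp sgt <4 x i32> %a, %b
  %s = sext <4 x i1> %c to <4 x i64>
  ret <4 x i64> %s
}

This is the pattern behind several of the AVX1/SSE41 test diffs below, where a vpmovsxdq/vpmovsxwd of the upper half becomes a plain vpshufd/vpunpckhwd.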
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/pr15267.ll
llvm/test/CodeGen/X86/pr49076.ll
llvm/test/CodeGen/X86/promote-cmp.ll
llvm/test/CodeGen/X86/vec_saddo.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/vec_ssubo.ll
llvm/test/CodeGen/X86/vec_uaddo.ll
llvm/test/CodeGen/X86/vec_umulo.ll
llvm/test/CodeGen/X86/vec_usubo.ll
llvm/test/CodeGen/X86/vector-sext.ll
llvm/test/CodeGen/X86/vsel-cmp-load.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6de15dbfc5e216..abe9f29bebd400 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8825,6 +8825,29 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Mask.append(NumElts, 0);
return true;
}
+ case ISD::SIGN_EXTEND_VECTOR_INREG: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
+
+ // Extended source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (NumBitsPerSrcElt % 8) != 0)
+ return false;
+
+ // We can only handle all-signbits extensions.
+ APInt DemandedSrcElts =
+ DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
+ return false;
+
+ assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
+ unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
+ for (unsigned I = 0; I != NumElts; ++I)
+ Mask.append(Scale, I);
+ Ops.push_back(Src);
+ return true;
+ }
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
@@ -58086,9 +58109,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
}
// Attempt to combine as a shuffle on SSE41+ targets.
- if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
- Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
- Subtarget.hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
index d0653bcfb29b61..921cf885185620 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -539,8 +539,6 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm2
; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
@@ -550,8 +548,6 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index e12cfff24bf88f..7e47aaba874719 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -180,8 +180,7 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, ptr %addr, <4 x double
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
@@ -266,8 +265,7 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll
index 9750ddf39a0009..5083eac71dce0c 100644
--- a/llvm/test/CodeGen/X86/pr15267.ll
+++ b/llvm/test/CodeGen/X86/pr15267.ll
@@ -71,8 +71,7 @@ define <4 x i64> @test3(ptr %in) nounwind {
; CHECK-NEXT: negl %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%wide.load35 = load <4 x i1>, ptr %in, align 1
diff --git a/llvm/test/CodeGen/X86/pr49076.ll b/llvm/test/CodeGen/X86/pr49076.ll
index 3ab20b71608d00..38235b8d2dbc42 100644
--- a/llvm/test/CodeGen/X86/pr49076.ll
+++ b/llvm/test/CodeGen/X86/pr49076.ll
@@ -14,8 +14,7 @@ define void @foo() {
; CHECK-NEXT: vpmovsxdq %xmm1, %xmm2
; CHECK-NEXT: # implicit-def: $ymm0
; CHECK-NEXT: vmovaps %xmm2, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovsxdq %xmm1, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: # %bb.2: # %BB1
; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll
index 614d86bd4c7942..88934a382bbfad 100644
--- a/llvm/test/CodeGen/X86/promote-cmp.ll
+++ b/llvm/test/CodeGen/X86/promote-cmp.ll
@@ -46,18 +46,14 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
; SSE4-LABEL: PR45808:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm4
+; SSE4-NEXT: movdqa %xmm0, %xmm5
+; SSE4-NEXT: pcmpgtq %xmm2, %xmm5
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm4, %xmm5
-; SSE4-NEXT: pcmpgtq %xmm2, %xmm5
-; SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE4-NEXT: pxor %xmm5, %xmm6
-; SSE4-NEXT: psllq $63, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE4-NEXT: pmovsxdq %xmm6, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm3, %xmm1
; SSE4-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index c4ef778ad89cb6..7631367ba5d667 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -503,12 +503,9 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
@@ -720,8 +717,7 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index b16c2d124164ce..88e4b11b66f541 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -1213,13 +1213,10 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsrad $31, %ymm2, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
@@ -2748,8 +2745,7 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 044f13162f28e6..d634457069c0da 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -508,12 +508,9 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
@@ -725,8 +722,7 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 38f46430c9e772..657e975a69440c 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -592,13 +592,10 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpmaxud %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
@@ -802,8 +799,7 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index a534df6aa14c5b..b919bdb40c5f18 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -1008,15 +1008,12 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpacksswb %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0
-; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1
-; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
@@ -2408,8 +2405,7 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index bf7acba7456834..df5da63b503599 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -635,13 +635,10 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpminud %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
@@ -847,8 +844,7 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 79d2fed813f925..eae1b1b23bcea6 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -1863,8 +1863,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
; AVX1-NEXT: negl %eax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -3218,8 +3217,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -3228,8 +3226,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -3261,8 +3258,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; X86-SSE41-NEXT: pslld $31, %xmm0
; X86-SSE41-NEXT: psrad $31, %xmm0
; X86-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X86-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; X86-SSE41-NEXT: movdqa %xmm2, %xmm0
; X86-SSE41-NEXT: retl
%extmask = sext <4 x i1> %mask to <4 x i64>
diff --git a/llvm/test/CodeGen/X86/vsel-cmp-load.ll b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
index 5a348c24372be8..bec0b60d95e826 100644
--- a/llvm/test/CodeGen/X86/vsel-cmp-load.ll
+++ b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
@@ -80,8 +80,7 @@ define <16 x i16> @sgt_zero(ptr %p, <16 x i16> %x, <16 x i16> %y) {
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0