[llvm] 7920527 - [X86][AVX] combineBitcastvxi1 - improve handling of vectors truncated to vXi1
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 24 07:11:53 PDT 2021
Author: Simon Pilgrim
Date: 2021-03-24T14:05:59Z
New Revision: 7920527796eacd596d362ab5266e23189565892e
URL: https://github.com/llvm/llvm-project/commit/7920527796eacd596d362ab5266e23189565892e
DIFF: https://github.com/llvm/llvm-project/commit/7920527796eacd596d362ab5266e23189565892e.diff
LOG: [X86][AVX] combineBitcastvxi1 - improve handling of vectors truncated to vXi1
If we're truncating to vXi1 from a wider type, then prefer the original wider vector, as it simplifies folding the separate truncations + extensions.
On AVX1 this is only worth it for the v8i1 cases, not v4i1, where we're always better off truncating down to v4i32 for movmsk.
Helps with some regressions encountered in D96609
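For context, here is a minimal IR sketch of the kind of pattern combineBitcastvxi1 handles here - a wide vector truncated to vXi1 and then bitcast to a scalar mask. This is illustrative only; the function name and body are made up for the sketch and are not taken from the modified tests:

define i1 @all_of_trunc_v8i32(<8 x i32> %a0) {
  ; truncate each i32 lane to i1, keeping only the low bit of each element
  %t = trunc <8 x i32> %a0 to <8 x i1>
  ; bitcast the <8 x i1> to an i8 mask - this is the vXi1 bitcast the combine matches
  %m = bitcast <8 x i1> %t to i8
  ; check that every lane was set
  %r = icmp eq i8 %m, -1
  ret i1 %r
}

With this change the AVX2 path keeps the computation at 256 bits (vpslld $31, %ymm0 + vmovmskps %ymm0) instead of shuffling/packing down to 128 bits first, as the updated checks in the modified tests below show.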
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bdb09b919a39..350a1b73c13e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39201,17 +39201,22 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);
}
-// Helper to peek through bitops/setcc to determine size of source vector.
+// Helper to peek through bitops/trunc/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
-static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
+ bool AllowTruncate) {
switch (Src.getOpcode()) {
+ case ISD::TRUNCATE:
+ if (!AllowTruncate)
+ return false;
+ LLVM_FALLTHROUGH;
case ISD::SETCC:
return Src.getOperand(0).getValueSizeInBits() == Size;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
- return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
- checkBitcastSrcVectorSize(Src.getOperand(1), Size);
+ return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
+ checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
}
return false;
}
@@ -39266,6 +39271,7 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
SDValue Src, const SDLoc &DL) {
switch (Src.getOpcode()) {
case ISD::SETCC:
+ case ISD::TRUNCATE:
return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
case ISD::AND:
case ISD::XOR:
@@ -39349,7 +39355,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
- if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
+ if (Subtarget.hasAVX() &&
+ checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
SExtVT = MVT::v4i64;
PropagateSExt = true;
}
@@ -39361,8 +39368,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
- if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
- checkBitcastSrcVectorSize(Src, 512))) {
+ if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
+ checkBitcastSrcVectorSize(Src, 512, true))) {
SExtVT = MVT::v8i32;
PropagateSExt = true;
}
@@ -39387,7 +39394,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
break;
}
// Split if this is a <64 x i8> comparison result.
- if (checkBitcastSrcVectorSize(Src, 512)) {
+ if (checkBitcastSrcVectorSize(Src, 512, false)) {
SExtVT = MVT::v64i8;
break;
}
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 2958f7e96cb5..6965d2bcccc9 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -219,16 +219,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
; SSE-NEXT: sete %al
; SSE-NEXT: retq
;
-; AVX-LABEL: trunc_v4i64_v4i1:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX-NEXT: vmovmskps %xmm0, %eax
-; AVX-NEXT: cmpb $15, %al
-; AVX-NEXT: sete %al
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v4i64_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: cmpb $15, %al
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_v4i64_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskpd %ymm0, %eax
+; AVX2-NEXT: cmpb $15, %al
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_v4i1:
; AVX512F: # %bb.0:
@@ -296,14 +305,11 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: cmpb $-1, %al
; AVX1-NEXT: sete %al
; AVX1-NEXT: vzeroupper
@@ -311,11 +317,8 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX2-LABEL: trunc_v8i32_v8i1:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: cmpb $-1, %al
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
@@ -536,17 +539,14 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
;
; AVX1-LABEL: trunc_v8i64_v8i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: cmpb $-1, %al
; AVX1-NEXT: sete %al
; AVX1-NEXT: vzeroupper
@@ -557,11 +557,8 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: cmpb $-1, %al
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index 283939f86a94..1363f5f35a9e 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -212,16 +212,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
-; AVX-LABEL: trunc_v4i64_v4i1:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX-NEXT: vmovmskps %xmm0, %eax
-; AVX-NEXT: testb %al, %al
-; AVX-NEXT: setne %al
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v4i64_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_v4i64_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskpd %ymm0, %eax
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_v4i1:
; AVX512F: # %bb.0:
@@ -285,25 +294,21 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: testl $43690, %eax # imm = 0xAAAA
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
+; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_v8i1:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testl $43690, %eax # imm = 0xAAAA
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskps %ymm0, %eax
+; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -521,17 +526,15 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
;
; AVX1-LABEL: trunc_v8i64_v8i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: testl $43690, %eax # imm = 0xAAAA
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
+; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -541,11 +544,9 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testl $43690, %eax # imm = 0xAAAA
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskps %ymm0, %eax
+; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index a60dbcd7480a..96b4d887eabf 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -215,16 +215,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
; SSE-NEXT: setnp %al
; SSE-NEXT: retq
;
-; AVX-LABEL: trunc_v4i64_v4i1:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX-NEXT: vmovmskps %xmm0, %eax
-; AVX-NEXT: testb %al, %al
-; AVX-NEXT: setnp %al
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v4i64_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: setnp %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_v4i64_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskpd %ymm0, %eax
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: setnp %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_v4i1:
; AVX512F: # %bb.0:
@@ -290,14 +299,11 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setnp %al
; AVX1-NEXT: vzeroupper
@@ -305,11 +311,8 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX2-LABEL: trunc_v8i32_v8i1:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setnp %al
; AVX2-NEXT: vzeroupper
@@ -548,17 +551,14 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
;
; AVX1-LABEL: trunc_v8i64_v8i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setnp %al
; AVX1-NEXT: vzeroupper
@@ -569,11 +569,8 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setnp %al
; AVX2-NEXT: vzeroupper