[llvm] 2830363 - [X86][SSE] combineVectorTruncation - enable (pre-SSSE3) vXi16->vXi8 truncation.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 24 07:51:52 PDT 2020
Author: Simon Pilgrim
Date: 2020-09-24T15:51:36+01:00
New Revision: 283036394ee606401ebb6601bb97b500e11a2830
URL: https://github.com/llvm/llvm-project/commit/283036394ee606401ebb6601bb97b500e11a2830
DIFF: https://github.com/llvm/llvm-project/commit/283036394ee606401ebb6601bb97b500e11a2830.diff
LOG: [X86][SSE] combineVectorTruncation - enable (pre-SSSE3) vXi16->vXi8 truncation.
Shuffle combining can now handle this output, and by performing this early in combineVectorTruncation we avoid a scalarization that caused a regression on D87502.
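For reference, a minimal LLVM IR sketch of the pattern this now covers on pre-SSSE3 (SSE2-only) targets; the function name and body here are illustrative rather than copied from the tests below, which exercise the same <16 x i16> -> <16 x i8> truncation:

define <16 x i8> @trunc_v16i16_v16i8_example(<16 x i16> %a) {
  ; On SSE2 this is expected to lower to the pand + packuswb sequence
  ; shown in the updated check lines in the test diffs below.
  %t = trunc <16 x i16> %a to <16 x i8>
  ret <16 x i8> %t
}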
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/masked_store_trunc.ll
llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
llvm/test/CodeGen/X86/vector-trunc-math.ll
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ceba7f724c99..0388cb72a76c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45378,7 +45378,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
- if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+ if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
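As a rough illustration of what the relaxed early-out now accepts (the value names below are made up and not taken from the tests):

  %wide  = trunc <16 x i16> %x to <16 x i8>   ; InSVT == i16, OutSVT == i8, NumElems == 16 -> now eligible for this combine
  %small = trunc <4 x i32> %y to <4 x i8>     ; NumElems < 8 -> still bails out with SDValue()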
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index b806c2c07695..1ba4e02846c4 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -4431,7 +4431,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
; SSE2-LABEL: truncstore_v32i16_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
@@ -4648,7 +4648,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
; SSE4-LABEL: truncstore_v32i16_v32i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm7, %xmm7
-; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE4-NEXT: pand %xmm6, %xmm1
; SSE4-NEXT: pand %xmm6, %xmm0
; SSE4-NEXT: packuswb %xmm1, %xmm0
@@ -5634,7 +5634,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma
; SSE2-LABEL: truncstore_v16i16_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
@@ -5746,7 +5746,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma
; SSE4-LABEL: truncstore_v16i16_v16i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE4-NEXT: pand %xmm4, %xmm1
; SSE4-NEXT: pand %xmm4, %xmm0
; SSE4-NEXT: packuswb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 30c1b4d9994a..3c5e46ec9643 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -358,7 +358,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
define i1 @trunc_v16i16_v16i1(<16 x i16>) {
; SSE-LABEL: trunc_v16i16_v16i1:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index 5dc9b7033d82..a497c83d2785 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -345,7 +345,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
define i1 @trunc_v16i16_v16i1(<16 x i16>) {
; SSE-LABEL: trunc_v16i16_v16i1:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 06a428c514a7..5b0109bcd4eb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -352,7 +352,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
define i1 @trunc_v16i16_v16i1(<16 x i16>) {
; SSE-LABEL: trunc_v16i16_v16i1:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index a71c9497879a..f245fc08b9c6 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -382,7 +382,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -798,7 +798,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -1226,7 +1226,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE: # %bb.0:
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -1610,7 +1610,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -2201,7 +2201,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE: # %bb.0:
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -2706,7 +2706,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -3106,7 +3106,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -3471,7 +3471,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -3871,7 +3871,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -4234,7 +4234,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -4634,7 +4634,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
; SSE: # %bb.0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -4997,7 +4997,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index b6d1b0d16f7a..bd8b7dd355cc 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1026,7 +1026,7 @@ entry:
define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -1217,7 +1217,7 @@ entry:
define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE-LABEL: trunc32i16_32i8:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
@@ -1675,13 +1675,29 @@ entry:
}
define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
-; SSE-LABEL: trunc2x8i16_16i8:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc2x8i16_16i8:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc2x8i16_16i8:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x8i16_16i8:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: trunc2x8i16_16i8:
; AVX: # %bb.0: # %entry