[llvm] 6a2c226 - [X86] Improve i8 all-ones element insertion in pre-SSE4.1
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Sat Sep 18 12:24:19 PDT 2021
Author: Roman Lebedev
Date: 2021-09-18T22:24:06+03:00
New Revision: 6a2c2263fbca07a59b9f41181c1418df052e24d1
URL: https://github.com/llvm/llvm-project/commit/6a2c2263fbca07a59b9f41181c1418df052e24d1
DIFF: https://github.com/llvm/llvm-project/commit/6a2c2263fbca07a59b9f41181c1418df052e24d1.diff
LOG: [X86] Improve i8 all-ones element insertion in pre-SSE4.1
Should avoid some regressions in D109065
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D109989
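
For reference, the new lowering reduces the insertion to a single vector OR against a constant mask that is all-ones in the inserted lane and zero everywhere else. A minimal sketch of the idea using SSE2 intrinsics (the function name and the fixed lane 0 are illustrative, not taken from the patch):

    #include <emmintrin.h> /* SSE2 */

    /* Insert 0xFF into lane 0 of a v16i8 without SSE4.1's pinsrb:
       OR with -1 forces a lane to all-ones, OR with 0 is a no-op,
       so every other lane passes through unchanged. */
    static __m128i insert_allones_lane0(__m128i v) {
      const __m128i mask = _mm_setr_epi8((char)0xFF, 0, 0, 0, 0, 0, 0, 0,
                                         0, 0, 0, 0, 0, 0, 0, 0);
      return _mm_or_si128(v, mask);
    }
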
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/insertelement-ones.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 09ba7af6e38ad..82b6a20a39c05 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -969,6 +969,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
@@ -1175,10 +1176,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
}
- // i8 vectors are custom because the source register and source
- // memory operand types are not the same width.
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
-
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
// We need to scalarize v4i64->v4i32 uint_to_fp using cvtsi2ss, but we can
// do the pre and post work in the vector domain.
@@ -19310,17 +19307,28 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
- // If we are inserting a element, see if we can do this more efficiently with
- // a blend shuffle with a rematerializable vector than a costly integer
- // insertion.
- if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
- (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
- SmallVector<int, 8> BlendMask;
- for (unsigned i = 0; i != NumElts; ++i)
- BlendMask.push_back(i == IdxVal ? i + NumElts : i);
- SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
- : getOnesVector(VT, DAG, dl);
- return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
+ if (IsZeroElt || IsAllOnesElt) {
+ // Lower insertion of i8 -1 as an 'OR' blend.
+ // We don't deal with i8 0 since it appears to be handled elsewhere.
+ if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
+ SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
+ SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
+ SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
+ CstVectorElts[IdxVal] = OnesCst;
+ SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
+ return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
+ }
+ // See if we can do this more efficiently with a blend shuffle with a
+ // rematerializable vector.
+ if (Subtarget.hasSSE41() &&
+ (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
+ SmallVector<int, 8> BlendMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+ : getOnesVector(VT, DAG, dl);
+ return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
+ }
}
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
index 04c02f04f816a..577b662393628 100644
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -311,36 +311,20 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movl $255, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE3: # %bb.0:
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT: movl $255, %eax
-; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: pandn %xmm2, %xmm1
-; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; SSE3-NEXT: por %xmm1, %xmm2
-; SSE3-NEXT: por %xmm2, %xmm0
+; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movl $255, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v16i8_x123456789ABCDEx:
@@ -364,48 +348,29 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movl $255, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255]
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: orps %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE3: # %bb.0:
-; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT: movl $255, %eax
-; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: pandn %xmm3, %xmm2
-; SSE3-NEXT: movdqa %xmm3, %xmm4
-; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE3-NEXT: por %xmm4, %xmm2
-; SSE3-NEXT: por %xmm2, %xmm0
-; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE3-NEXT: por %xmm4, %xmm3
-; SSE3-NEXT: por %xmm3, %xmm1
+; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255]
+; SSE3-NEXT: orps %xmm2, %xmm0
+; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: orps %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movl $255, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255]
+; SSSE3-NEXT: orps %xmm2, %xmm0
+; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: orps %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
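
The updated CHECK lines show each all-ones insertion collapsing from the old movd/pandn/pslldq/por (or palignr) sequences into one orps against a constant-pool mask; in the v32i8 case the shared lane-15 mask is materialized in a register once (the movaps into %xmm2) and reused for both 128-bit halves. A rough sketch of that shape with SSE2 intrinsics (function and parameter names are illustrative, not from the test):

    #include <emmintrin.h> /* SSE2 */

    /* Each 16-byte half gets one OR with its own constant mask plus one OR
       with a shared lane-15 mask, mirroring the orps pairs checked above. */
    static void or_blend_halves(__m128i *lo, __m128i *hi,
                                __m128i lo_mask, __m128i hi_mask) {
      const __m128i lane15 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                           0, 0, 0, 0, 0, 0, 0, (char)0xFF);
      *lo = _mm_or_si128(_mm_or_si128(*lo, lo_mask), lane15);
      *hi = _mm_or_si128(_mm_or_si128(*hi, hi_mask), lane15);
    }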