[llvm] b52962d - [X86] LowerVSELECT - split v16i16/v32i8 pre-AVX2 VSELECT ops if enough of the operands are free to split.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri May 31 06:43:22 PDT 2024
Author: Simon Pilgrim
Date: 2024-05-31T14:43:10+01:00
New Revision: b52962d1b89ca9102a89497743b7576d572b437e
URL: https://github.com/llvm/llvm-project/commit/b52962d1b89ca9102a89497743b7576d572b437e
DIFF: https://github.com/llvm/llvm-project/commit/b52962d1b89ca9102a89497743b7576d572b437e.diff
LOG: [X86] LowerVSELECT - split v16i16/v32i8 pre-AVX2 VSELECT ops if enough of the operands are free to split.
Often on AVX1 we're better off consistently using 128-bit instructions, so recognise when the operands are loads that can be freely/cheaply split. Ideally this functionality would be moved into isFreeToSplitVector, but that helper is currently used in a few places where we don't want to split loads yet.
Based off a regression reported after #92794
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vselect-pcmp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 839006cbaed4c..7b9e6c0a00273 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17846,6 +17846,22 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
}
+ // v16i16/v32i8 selects without AVX2, if the condition and another operand
+ // are free to split, then better to split before expanding the
+ // select. Don't bother with XOP as it has the fast VPCMOV instruction.
+ // TODO: This is very similar to narrowVectorSelect.
+ // TODO: Add Load splitting to isFreeToSplitVector ?
+ if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
+ !Subtarget.hasXOP()) {
+ bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
+ bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
+ (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
+ bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
+ (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
+ if (FreeCond && (FreeLHS || FreeRHS))
+ return splitVectorOp(Op, DAG, dl);
+ }
+
// Only some types will be legal on some subtargets. If we can emit a legal
// VSELECT-matching blend, return Op, and but if we need to expand, return
// a null value.
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index f976222ac3b37..84317ad34fb29 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -1509,16 +1509,16 @@ define void @store_blend_load_v16i16(ptr %a0, ptr %a1, ptr %a2) {
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
-; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdx)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa (%rsi), %xmm4
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_blend_load_v16i16:
@@ -1578,16 +1578,16 @@ define void @store_blend_load_v32i8(ptr %a0, ptr %a1, ptr %a2) {
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
-; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdx)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa (%rsi), %xmm4
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_blend_load_v32i8:
More information about the llvm-commits mailing list