[PATCH] D60838: [x86] use psubus for more vsetcc lowering (PR39859)
Sanjay Patel via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 17 14:52:23 PDT 2019
spatel created this revision.
spatel added reviewers: andreadb, craig.topper, RKSimon, nikic.
Herald added subscribers: hiraditya, mcrosier.
Herald added a project: LLVM.
Circling back to a leftover bit from PR39859:
https://bugs.llvm.org/show_bug.cgi?id=39859#c1
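...we have this counter-intuitive (based on the test diffs, which show more instructions after the change) opportunity to use 'psubus'. Even though the new sequence is longer (7 instructions vs. 6 in the example below), it appears to be the better perf option for both Haswell and Jaguar based on llvm-mca (only the Haswell numbers are shown below: 409 total cycles vs. 909). We already do this transform for the SETULT predicate, so this would make the code more symmetrical too. If we have pminub/pminuw, we prefer those, so this should not affect anything but pre-SSE4.1 subtargets.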
$ cat before.s
movdqa -16(%rip), %xmm2 ## xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
pxor %xmm0, %xmm2
pcmpgtw -32(%rip), %xmm2 ## xmm2 = [255,255,255,255,255,255,255,255]
pand %xmm2, %xmm0
pandn %xmm1, %xmm2
por %xmm2, %xmm0
$ cat after.s
movdqa -16(%rip), %xmm2 ## xmm2 = [256,256,256,256,256,256,256,256]
psubusw %xmm0, %xmm2
pxor %xmm3, %xmm3
pcmpeqw %xmm2, %xmm3
pand %xmm3, %xmm0
pandn %xmm1, %xmm3
por %xmm3, %xmm0
$ llvm-mca before.s -mcpu=haswell
Iterations: 100
Instructions: 600
Total Cycles: 909
Total uOps: 700
Dispatch Width: 4
uOps Per Cycle: 0.77
IPC: 0.66
Block RThroughput: 1.8
$ llvm-mca after.s -mcpu=haswell
Iterations: 100
Instructions: 700
Total Cycles: 409
Total uOps: 700
Dispatch Width: 4
uOps Per Cycle: 1.71
IPC: 1.71
Block RThroughput: 1.8
https://reviews.llvm.org/D60838
Files:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vec_setcc-2.ll
Index: llvm/test/CodeGen/X86/vec_setcc-2.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_setcc-2.ll
+++ llvm/test/CodeGen/X86/vec_setcc-2.ll
@@ -194,8 +194,10 @@
define <8 x i1> @ugt_v8i16_splat(<8 x i16> %x) {
; SSE2-LABEL: ugt_v8i16_splat:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: ugt_v8i16_splat:
@@ -541,9 +543,10 @@
define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: PR39859:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [43,43,43,43,43,43,43,43]
+; SSE2-NEXT: psubusw %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19745,6 +19745,34 @@
return DAG.getBuildVector(VT, DL, NewVecC);
}
+/// Given a simple buildvector constant, return a new vector constant with each
+/// element incremented. If incrementing would result in overflow or this
+/// is not a simple vector constant, return an empty value.
+static SDValue incrementVectorConstant(SDValue V, SelectionDAG &DAG) {
+ auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
+ if (!BV)
+ return SDValue();
+
+ MVT VT = V.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> NewVecC;
+ SDLoc DL(V);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
+ return SDValue();
+
+ // Avoid overflow.
+ if (Elt->getAPIntValue().isMaxValue())
+ return SDValue();
+
+ NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() + 1, DL, EltVT));
+ }
+
+ return DAG.getBuildVector(VT, DL, NewVecC);
+}
+
/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:
/// t = psubus Op0, Op1
@@ -19777,6 +19805,18 @@
Op1 = ULEOp1;
break;
}
+ case ISD::SETUGT: {
+ // If the comparison is against a constant, we can turn this into a setuge.
+ // This is beneficial because materializing a constant 0 for the PCMPEQ is
+ // probably cheaper than XOR+PCMPGT using 2 different vector constants:
+ // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
+ SDValue UGEOp1 = incrementVectorConstant(Op1, DAG);
+ if (!UGEOp1)
+ return SDValue();
+ Op1 = Op0;
+ Op0 = UGEOp1;
+ break;
+ }
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:
std::swap(Op0, Op1);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D60838.195629.patch
Type: text/x-patch
Size: 3297 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190417/49572884/attachment.bin>
More information about the llvm-commits
mailing list