[llvm] 2f2dcb4 - [AArch64][SVE] Invert VSelect operand order and condition for predicated arithmetic operations
Matt Devereau via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 17 08:01:47 PST 2022
Author: Matt Devereau
Date: 2022-02-17T16:01:17Z
New Revision: 2f2dcb4fb134a7e06d99ef62ca512c8307187207
URL: https://github.com/llvm/llvm-project/commit/2f2dcb4fb134a7e06d99ef62ca512c8307187207
DIFF: https://github.com/llvm/llvm-project/commit/2f2dcb4fb134a7e06d99ef62ca512c8307187207.diff
LOG: [AArch64][SVE] Invert VSelect operand order and condition for predicated arithmetic operations
(vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
=> (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
As a follow up to D117689, invert the operand order and condition
in order to fold vselects into predicated instructions.
Differential Revision: https://reviews.llvm.org/D119424
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
llvm/test/CodeGen/AArch64/sve-select.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6e763202ce917..810cdb748b3bf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17152,12 +17152,51 @@ static SDValue performTBZCombine(SDNode *N,
DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}
+// Swap vselect operands where it may allow a predicated operation to achieve
+// the `sel`.
+//
+// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
+// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
+//
+// The combine only fires when the result type is a scalable vector, the
+// condition is a SETCC with a single use, `op` is one of FMUL/FSUB/FADD and
+// the select's true value is the first operand of `op`. In every other case
+// an empty SDValue is returned and N is left untouched.
+static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
+  auto SelectA = N->getOperand(1);
+  auto SelectB = N->getOperand(2);
+  auto NTy = N->getValueType(0);
+
+  if (!NTy.isScalableVector())
+    return SDValue();
+
+  // Only a compare whose sole user is this select is rewritten; presumably
+  // this avoids keeping the original compare alive next to the inverted one.
+  SDValue SetCC = N->getOperand(0);
+  if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
+    return SDValue();
+
+  switch (SelectB.getOpcode()) {
+  default:
+    return SDValue();
+  case ISD::FMUL:
+  case ISD::FSUB:
+  case ISD::FADD:
+    break;
+  }
+  if (SelectA != SelectB.getOperand(0))
+    return SDValue();
+
+  // Invert the condition using the type of the values being compared, not the
+  // integer predicate type that the SETCC produces. getSetCCInverse() uses the
+  // type to choose between integer and floating-point inversion, and only the
+  // floating-point form flips the unordered bit (SETOEQ -> SETUNE instead of
+  // SETONE), which is what keeps NaN lanes selecting the correct value.
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+  ISD::CondCode InverseCC =
+      ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
+  auto InverseSetCC =
+      DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
+                   SetCC.getOperand(1), InverseCC);
+
+  return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
+                     {InverseSetCC, SelectB, SelectA});
+}
+
// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+ if (auto SwapResult = trySwapVSelectOperands(N, DAG))
+ return SwapResult;
+
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
index 2385436cfe587..1b2e0be6111c0 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
@@ -95,14 +95,13 @@ define <vscale x 8 x half> @fsqrt_recip_8f16(<vscale x 8 x half> %a) #0 {
; CHECK-NEXT: frsqrte z1.h, z0.h
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: fmul z2.h, z1.h, z1.h
-; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z2.h, z1.h, z1.h
; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z2.h
-; CHECK-NEXT: fmul z1.h, z0.h, z1.h
-; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
%fsqrt = call fast <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> %a)
ret <vscale x 8 x half> %fsqrt
@@ -124,14 +123,13 @@ define <vscale x 4 x float> @fsqrt_recip_4f32(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: frsqrte z1.s, z0.s
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fmul z2.s, z1.s, z1.s
-; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z2.s, z1.s, z1.s
; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z2.s
-; CHECK-NEXT: fmul z1.s, z0.s, z1.s
-; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%fsqrt = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> %a)
ret <vscale x 4 x float> %fsqrt
@@ -153,7 +151,7 @@ define <vscale x 2 x double> @fsqrt_recip_2f64(<vscale x 2 x double> %a) #0 {
; CHECK-NEXT: frsqrte z1.d, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: fmul z2.d, z1.d, z1.d
-; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, #0.0
; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z2.d, z1.d, z1.d
@@ -162,8 +160,7 @@ define <vscale x 2 x double> @fsqrt_recip_2f64(<vscale x 2 x double> %a) #0 {
; CHECK-NEXT: fmul z2.d, z1.d, z1.d
; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z2.d
-; CHECK-NEXT: fmul z1.d, z0.d, z1.d
-; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%fsqrt = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %a)
ret <vscale x 2 x double> %fsqrt
diff --git a/llvm/test/CodeGen/AArch64/sve-select.ll b/llvm/test/CodeGen/AArch64/sve-select.ll
index 819620a299f00..3183e1e54f081 100644
--- a/llvm/test/CodeGen/AArch64/sve-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-select.ll
@@ -542,3 +542,111 @@ define <vscale x 16 x i1> @icmp_select_nxv16i1(<vscale x 16 x i1> %a, <vscale x
%sel = select i1 %mask, <vscale x 16 x i1> %a, <vscale x 16 x i1> %b
ret <vscale x 16 x i1> %sel
}
+
+; select(fcmp oeq(%a, 0), %a, fmul(%a, %b)): the select's true value is the
+; fmul's first operand. The checks expect the compare to be emitted inverted
+; (fcmne) and the select to be folded into a merging predicated fmul.
+define <vscale x 4 x float> @select_f32_invert_fmul(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: select_f32_invert_fmul:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
+ %fmul = fmul <vscale x 4 x float> %a, %b
+ %sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %fmul
+ ret <vscale x 4 x float> %sel
+}
+
+; select(fcmp oeq(%a, 0), %a, fadd(%a, %b)): the select's true value is the
+; fadd's first operand. The checks expect an inverted compare (fcmne) feeding
+; a merging predicated fadd, with no separate sel.
+define <vscale x 4 x float> @select_f32_invert_fadd(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: select_f32_invert_fadd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
+ %fadd = fadd <vscale x 4 x float> %a, %b
+ %sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %fadd
+ ret <vscale x 4 x float> %sel
+}
+
+; select(fcmp oeq(%a, 0), %a, fsub(%a, %b)): the select's true value is the
+; fsub's first operand. The checks expect an inverted compare (fcmne) feeding
+; a merging predicated fsub, with no separate sel.
+define <vscale x 4 x float> @select_f32_invert_fsub(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: select_f32_invert_fsub:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
+ %fsub = fsub <vscale x 4 x float> %a, %b
+ %sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %fsub
+ ret <vscale x 4 x float> %sel
+}
+
+; Negative case: the fmul is already the select's true value and %a is the
+; false value, so no swap is needed. The checks expect the original compare
+; (fcmeq) to be kept and used directly as the predicate of the fmul.
+define <vscale x 4 x float> @select_f32_no_invert_op_lhs(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: select_f32_no_invert_op_lhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
+ %fmul = fmul <vscale x 4 x float> %a, %b
+ %sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %fmul, <vscale x 4 x float> %a
+ ret <vscale x 4 x float> %sel
+}
+
+; Negative case: both select values are fmuls of unrelated operands, so
+; neither select value is an operand of the other. The checks expect the
+; compare to stay fcmeq and an explicit sel between two unpredicated fmuls.
+define <vscale x 4 x float> @select_f32_no_invert_2_op(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, <vscale x 4 x float> %d) {
+; CHECK-LABEL: select_f32_no_invert_2_op:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmul z2.s, z2.s, z3.s
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fmul z0.s, z0.s, z1.s
+; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s
+; CHECK-NEXT: ret
+ %p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
+ %fmul1 = fmul <vscale x 4 x float> %a, %b
+ %fmul2 = fmul <vscale x 4 x float> %c, %d
+ %sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %fmul1, <vscale x 4 x float> %fmul2
+ ret <vscale x 4 x float> %sel
+}
+
+; Negative case: both select values are the same fmul result %m. The checks
+; expect only the fmul to remain, with no compare and no select emitted.
+define <vscale x 4 x float> @select_f32_no_invert_equal_ops(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: select_f32_no_invert_equal_ops:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %m = fmul <vscale x 4 x float> %a, %b
+ %p = fcmp oeq <vscale x 4 x float> %m, zeroinitializer
+ %sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %m, <vscale x 4 x float> %m
+ ret <vscale x 4 x float> %sel
+}
+
+; Negative case: the compare %p feeds two selects, so it has more than one
+; use. The checks expect the compare to stay fcmeq, with an explicit sel
+; followed by a predicated mov for the second select.
+; NOTE: despite "fmul" in the function name, the arithmetic op here is fadd.
+define <vscale x 4 x float> @select_f32_no_invert_fmul_two_setcc_uses(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, i32 %len) #0 {
+; CHECK-LABEL: select_f32_no_invert_fmul_two_setcc_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fadd z1.s, z0.s, z1.s
+; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: mov z0.s, p0/m, z2.s
+; CHECK-NEXT: ret
+ %p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
+ %fadd = fadd <vscale x 4 x float> %a, %b
+ %sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %fadd
+ %sel2 = select <vscale x 4 x i1> %p, <vscale x 4 x float> %c, <vscale x 4 x float> %sel
+ ret <vscale x 4 x float> %sel2
+}
+
+; Negative case: same select(oeq(%a, 0), %a, fmul(%a, %b)) shape as the invert
+; tests, but on a fixed-width <4 x float> vector. The checks expect the
+; compare to stay fcmeq and the select to be done with a separate bif.
+define <4 x float> @select_f32_no_invert_not_scalable(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: select_f32_no_invert_not_scalable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0
+; CHECK-NEXT: fmul v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+ %p = fcmp oeq <4 x float> %a, zeroinitializer
+ %fmul = fmul <4 x float> %a, %b
+ %sel = select <4 x i1> %p, <4 x float> %a, <4 x float> %fmul
+ ret <4 x float> %sel
+}
More information about the llvm-commits
mailing list