[llvm] c76d6dd - [ARM] Generate VCTP from SETCC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 26 02:57:20 PST 2021
Author: David Green
Date: 2021-11-26T10:57:14Z
New Revision: c76d6dd192648fbb1a55eb8378850a654b673832
URL: https://github.com/llvm/llvm-project/commit/c76d6dd192648fbb1a55eb8378850a654b673832
DIFF: https://github.com/llvm/llvm-project/commit/c76d6dd192648fbb1a55eb8378850a654b673832.diff
LOG: [ARM] Generate VCTP from SETCC
This converts a vector SETCC([0,1,2,..], splat(n), ult) to vctp n, which
can take fewer instructions and avoids the need for constant pool loads.
Differential Revision: https://reviews.llvm.org/D114177
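As a rough illustration (hand-written here, not taken verbatim from the patch's
tests, and the combine itself runs on SelectionDAG SETCC nodes rather than on
IR), the pattern being matched and the form it is folded into look like this:

; Before: compare the stepped constant vector [0,1,2,3] against a splat of %n (ult).
define <4 x i1> @iota_ult_splat(i32 %n) {
  %i  = insertelement <4 x i32> undef, i32 %n, i32 0
  %ns = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
  %c  = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %ns
  ret <4 x i1> %c
}

; After (conceptually): lane i is active iff i < %n, which is exactly the
; predicate produced by the MVE vctp32 intrinsic, so no constant pool is needed.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
define <4 x i1> @iota_ult_splat_vctp(i32 %n) {
  %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
  ret <4 x i1> %c
}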
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/Thumb2/mve-vctp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 5f5e76b9558a5..d87ef2babf164 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1016,6 +1016,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FP_EXTEND);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SELECT_CC);
+ setTargetDAGCombine(ISD::SETCC);
}
if (Subtarget->hasMVEFloatOps()) {
setTargetDAGCombine(ISD::FADD);
@@ -13082,6 +13083,65 @@ static SDValue PerformVSELECTCombine(SDNode *N,
return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
}
+// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
+static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ EVT VT = N->getValueType(0);
+
+ if (!Subtarget->hasMVEIntegerOps() ||
+ !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ if (CC == ISD::SETUGE) {
+ std::swap(Op0, Op1);
+ CC = ISD::SETULT;
+ }
+
+ if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
+ Op0.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ // Check first operand is BuildVector of 0,1,2,...
+ for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
+ if (!Op0.getOperand(I).isUndef() &&
+ !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
+ Op0.getConstantOperandVal(I) == I))
+ return SDValue();
+ }
+
+ // The second is a Splat of Op1S
+ SDValue Op1S = DCI.DAG.getSplatValue(Op1);
+ if (!Op1S)
+ return SDValue();
+
+ unsigned Opc;
+ switch (VT.getVectorNumElements()) {
+ case 2:
+ Opc = Intrinsic::arm_mve_vctp64;
+ break;
+ case 4:
+ Opc = Intrinsic::arm_mve_vctp32;
+ break;
+ case 8:
+ Opc = Intrinsic::arm_mve_vctp16;
+ break;
+ case 16:
+ Opc = Intrinsic::arm_mve_vctp8;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DCI.DAG.getConstant(Opc, DL, MVT::i32),
+ DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
+}
+
static SDValue PerformABSCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -18196,6 +18256,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT_CC:
case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
+ case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
diff --git a/llvm/test/CodeGen/Thumb2/mve-vctp.ll b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
index a91d291be4336..e7eddc19e9699 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vctp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
@@ -56,19 +56,11 @@ define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
define arm_aapcs_vfpcc <4 x i32> @vcmp_ult_v4i32(i32 %n, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vcmp_ult_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q2, r0
-; CHECK-NEXT: adr r0, .LCPI3_0
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vcmp.u32 hi, q2, q3
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vctp.32 r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI3_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
entry:
%i = insertelement <4 x i32> undef, i32 %n, i32 0
%ns = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -80,19 +72,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vcmp_uge_v4i32(i32 %n, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vcmp_uge_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q2, r0
-; CHECK-NEXT: adr r0, .LCPI4_0
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vcmp.u32 cs, q2, q3
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vctp.32 r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI4_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
entry:
%i = insertelement <4 x i32> undef, i32 %n, i32 0
%ns = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -104,19 +88,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vcmp_ult_v4i32_undef(i32 %n, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vcmp_ult_v4i32_undef:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q2, r0
-; CHECK-NEXT: adr r0, .LCPI5_0
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vcmp.u32 hi, q2, q3
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vctp.32 r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI5_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .zero 4
-; CHECK-NEXT: .zero 4
entry:
%i = insertelement <4 x i32> undef, i32 %n, i32 0
%ns = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -129,23 +105,12 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vcmp_ult_v8i16(i16 %n, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: vcmp_ult_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.16 q2, r0
-; CHECK-NEXT: adr r0, .LCPI6_0
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vcmp.u16 hi, q2, q3
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vctp.16 r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI6_0:
-; CHECK-NEXT: .short 0 @ 0x0
-; CHECK-NEXT: .short 1 @ 0x1
-; CHECK-NEXT: .short 2 @ 0x2
-; CHECK-NEXT: .short 3 @ 0x3
-; CHECK-NEXT: .short 4 @ 0x4
-; CHECK-NEXT: .short 5 @ 0x5
-; CHECK-NEXT: .short 6 @ 0x6
-; CHECK-NEXT: .short 7 @ 0x7
entry:
%i = insertelement <8 x i16> undef, i16 %n, i32 0
%ns = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -157,23 +122,12 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vcmp_uge_v8i16(i16 %n, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: vcmp_uge_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.16 q2, r0
-; CHECK-NEXT: adr r0, .LCPI7_0
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vcmp.u16 cs, q2, q3
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vctp.16 r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI7_0:
-; CHECK-NEXT: .short 0 @ 0x0
-; CHECK-NEXT: .short 1 @ 0x1
-; CHECK-NEXT: .short 2 @ 0x2
-; CHECK-NEXT: .short 3 @ 0x3
-; CHECK-NEXT: .short 4 @ 0x4
-; CHECK-NEXT: .short 5 @ 0x5
-; CHECK-NEXT: .short 6 @ 0x6
-; CHECK-NEXT: .short 7 @ 0x7
entry:
%i = insertelement <8 x i16> undef, i16 %n, i32 0
%ns = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -186,31 +140,12 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vcmp_ult_v16i8(i8 %n, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vcmp_ult_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.8 q2, r0
-; CHECK-NEXT: adr r0, .LCPI8_0
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vcmp.u8 hi, q2, q3
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: uxtb r0, r0
+; CHECK-NEXT: vctp.8 r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI8_0:
-; CHECK-NEXT: .byte 0 @ 0x0
-; CHECK-NEXT: .byte 1 @ 0x1
-; CHECK-NEXT: .byte 2 @ 0x2
-; CHECK-NEXT: .byte 3 @ 0x3
-; CHECK-NEXT: .byte 4 @ 0x4
-; CHECK-NEXT: .byte 5 @ 0x5
-; CHECK-NEXT: .byte 6 @ 0x6
-; CHECK-NEXT: .byte 7 @ 0x7
-; CHECK-NEXT: .byte 8 @ 0x8
-; CHECK-NEXT: .byte 9 @ 0x9
-; CHECK-NEXT: .byte 10 @ 0xa
-; CHECK-NEXT: .byte 11 @ 0xb
-; CHECK-NEXT: .byte 12 @ 0xc
-; CHECK-NEXT: .byte 13 @ 0xd
-; CHECK-NEXT: .byte 14 @ 0xe
-; CHECK-NEXT: .byte 15 @ 0xf
entry:
%i = insertelement <16 x i8> undef, i8 %n, i32 0
%ns = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -222,31 +157,12 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vcmp_uge_v16i8(i8 %n, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vcmp_uge_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.8 q2, r0
-; CHECK-NEXT: adr r0, .LCPI9_0
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vcmp.u8 cs, q2, q3
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: uxtb r0, r0
+; CHECK-NEXT: vctp.8 r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI9_0:
-; CHECK-NEXT: .byte 0 @ 0x0
-; CHECK-NEXT: .byte 1 @ 0x1
-; CHECK-NEXT: .byte 2 @ 0x2
-; CHECK-NEXT: .byte 3 @ 0x3
-; CHECK-NEXT: .byte 4 @ 0x4
-; CHECK-NEXT: .byte 5 @ 0x5
-; CHECK-NEXT: .byte 6 @ 0x6
-; CHECK-NEXT: .byte 7 @ 0x7
-; CHECK-NEXT: .byte 8 @ 0x8
-; CHECK-NEXT: .byte 9 @ 0x9
-; CHECK-NEXT: .byte 10 @ 0xa
-; CHECK-NEXT: .byte 11 @ 0xb
-; CHECK-NEXT: .byte 12 @ 0xc
-; CHECK-NEXT: .byte 13 @ 0xd
-; CHECK-NEXT: .byte 14 @ 0xe
-; CHECK-NEXT: .byte 15 @ 0xf
entry:
%i = insertelement <16 x i8> undef, i8 %n, i32 0
%ns = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer