[llvm] 264b1b2 - [ARM] Convert vector fdiv+fcvt fixed-point combine to fmul.
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 3 01:31:41 PDT 2024
Author: David Green
Date: 2024-06-03T09:31:36+01:00
New Revision: 264b1b24869eb45463a98d70e9b9e991092acc28
URL: https://github.com/llvm/llvm-project/commit/264b1b24869eb45463a98d70e9b9e991092acc28
DIFF: https://github.com/llvm/llvm-project/commit/264b1b24869eb45463a98d70e9b9e991092acc28.diff
LOG: [ARM] Convert vector fdiv+fcvt fixed-point combine to fmul.
InstCombine will convert an fdiv by a power of 2 into an fmul, so this changes
PerformVDIVCombine, which folds fdiv+fcvt into a fixed-point fcvt, to match
fmul+fcvt instead. The fdiv tests will look worse, but that pattern won't
appear in practice (and should be improved again by #93882).
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/ARM/vdiv_combine.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 5090d8bf6cf22..5212d2c620b75 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1000,7 +1000,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT,
- ISD::FP_TO_UINT, ISD::FDIV, ISD::LOAD});
+ ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
// It is legal to extload from v4i8 to v4i16 or v4i32.
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
@@ -17011,17 +17011,17 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
-/// can replace combinations of VCVT (integer to floating-point) and VDIV
-/// when the VDIV has a constant operand that is a power of 2.
+/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
+/// can replace combinations of VCVT (integer to floating-point) and VMUL
+/// when the VMUL has a constant operand that is a power of 2.
///
-/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+/// Example (assume d17 = <float 0.125, float 0.125>):
/// vcvt.f32.s32 d16, d16
-/// vdiv.f32 d16, d17, d16
+/// vmul.f32 d16, d16, d17
/// becomes:
/// vcvt.f32.s32 d16, d16, #3
-static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *Subtarget) {
+static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
@@ -17048,26 +17048,34 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
- BitVector UndefElements;
- BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
- int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+ ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
+ APFloat Recip(0.0f);
+ if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
+ return SDValue();
+
+ bool IsExact;
+ APSInt IntVal(33);
+ if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
+ APFloat::opOK ||
+ !IsExact)
+ return SDValue();
+
+ int32_t C = IntVal.exactLogBase2();
if (C == -1 || C == 0 || C > 32)
return SDValue();
- SDLoc dl(N);
+ SDLoc DL(N);
bool isSigned = OpOpcode == ISD::SINT_TO_FP;
SDValue ConvInput = Op.getOperand(0);
if (IntBits < FloatBits)
- ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
- dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
- ConvInput);
+ ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+ NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
- unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
- Intrinsic::arm_neon_vcvtfxu2fp;
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
- Op.getValueType(),
- DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
- ConvInput, DAG.getConstant(C, dl, MVT::i32));
+ unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
+ : Intrinsic::arm_neon_vcvtfxu2fp;
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
+ DAG.getConstant(C, DL, MVT::i32));
}
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
@@ -18897,8 +18905,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
case ISD::FADD:
return PerformFADDCombine(N, DCI.DAG, Subtarget);
- case ISD::FDIV:
- return PerformVDIVCombine(N, DCI.DAG, Subtarget);
+ case ISD::FMUL:
+ return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
return PerformIntrinsicCombine(N, DCI);
case ISD::SHL:
diff --git a/llvm/test/CodeGen/ARM/vdiv_combine.ll b/llvm/test/CodeGen/ARM/vdiv_combine.ll
index c8721e700a331..988844661085e 100644
--- a/llvm/test/CodeGen/ARM/vdiv_combine.ll
+++ b/llvm/test/CodeGen/ARM/vdiv_combine.ll
@@ -5,7 +5,10 @@
define arm_aapcs_vfpcc <2 x float> @t1(<2 x i32> %vecinit2.i) nounwind {
; CHECK-LABEL: t1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 d0, d0, #3
+; CHECK-NEXT: vmov.f32 s2, #8.000000e+00
+; CHECK-NEXT: vcvt.f32.s32 d2, d0
+; CHECK-NEXT: vdiv.f32 s1, s5, s2
+; CHECK-NEXT: vdiv.f32 s0, s4, s2
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -17,7 +20,10 @@ entry:
define arm_aapcs_vfpcc <2 x float> @t2(<2 x i32> %vecinit2.i) nounwind {
; CHECK-LABEL: t2:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.u32 d0, d0, #3
+; CHECK-NEXT: vmov.f32 s2, #8.000000e+00
+; CHECK-NEXT: vcvt.f32.u32 d2, d0
+; CHECK-NEXT: vdiv.f32 s1, s5, s2
+; CHECK-NEXT: vdiv.f32 s0, s4, s2
; CHECK-NEXT: bx lr
entry:
%vcvt.i = uitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -71,8 +77,17 @@ entry:
define arm_aapcs_vfpcc <2 x float> @t5(<2 x i32> %vecinit2.i) nounwind {
; CHECK-LABEL: t5:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 d0, d0, #32
+; CHECK-NEXT: vcvt.f32.s32 d2, d0
+; CHECK-NEXT: vldr s2, LCPI4_0
+; CHECK-NEXT: vdiv.f32 s1, s5, s2
+; CHECK-NEXT: vdiv.f32 s0, s4, s2
; CHECK-NEXT: bx lr
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: @ %bb.1:
+; CHECK-NEXT: .data_region
+; CHECK-NEXT: LCPI4_0:
+; CHECK-NEXT: .long 0x4f800000 @ float 4.2949673E+9
+; CHECK-NEXT: .end_data_region
entry:
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
%div.i = fdiv <2 x float> %vcvt.i, <float 0x41F0000000000000, float 0x41F0000000000000>
@@ -83,7 +98,12 @@ entry:
define arm_aapcs_vfpcc <4 x float> @t6(<4 x i32> %vecinit6.i) nounwind {
; CHECK-LABEL: t6:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
+; CHECK-NEXT: vmov.f32 s4, #8.000000e+00
+; CHECK-NEXT: vcvt.f32.s32 q2, q0
+; CHECK-NEXT: vdiv.f32 s3, s11, s4
+; CHECK-NEXT: vdiv.f32 s2, s10, s4
+; CHECK-NEXT: vdiv.f32 s1, s9, s4
+; CHECK-NEXT: vdiv.f32 s0, s8, s4
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <4 x i32> %vecinit6.i to <4 x float>
@@ -95,7 +115,12 @@ define arm_aapcs_vfpcc <4 x float> @fix_unsigned_i16_to_float(<4 x i16> %in) {
; CHECK-LABEL: fix_unsigned_i16_to_float:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmovl.u16 q8, d0
-; CHECK-NEXT: vcvt.f32.u32 q0, q8, #1
+; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
+; CHECK-NEXT: vcvt.f32.u32 q2, q8
+; CHECK-NEXT: vdiv.f32 s3, s11, s4
+; CHECK-NEXT: vdiv.f32 s2, s10, s4
+; CHECK-NEXT: vdiv.f32 s1, s9, s4
+; CHECK-NEXT: vdiv.f32 s0, s8, s4
; CHECK-NEXT: bx lr
%conv = uitofp <4 x i16> %in to <4 x float>
%shift = fdiv <4 x float> %conv, <float 2.0, float 2.0, float 2.0, float 2.0>
@@ -106,7 +131,12 @@ define arm_aapcs_vfpcc <4 x float> @fix_signed_i16_to_float(<4 x i16> %in) {
; CHECK-LABEL: fix_signed_i16_to_float:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmovl.s16 q8, d0
-; CHECK-NEXT: vcvt.f32.s32 q0, q8, #1
+; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
+; CHECK-NEXT: vcvt.f32.s32 q2, q8
+; CHECK-NEXT: vdiv.f32 s3, s11, s4
+; CHECK-NEXT: vdiv.f32 s2, s10, s4
+; CHECK-NEXT: vdiv.f32 s1, s9, s4
+; CHECK-NEXT: vdiv.f32 s0, s8, s4
; CHECK-NEXT: bx lr
%conv = sitofp <4 x i16> %in to <4 x float>
%shift = fdiv <4 x float> %conv, <float 2.0, float 2.0, float 2.0, float 2.0>
@@ -166,8 +196,19 @@ define arm_aapcs_vfpcc <2 x double> @fix_i64_to_double(<2 x i64> %in) {
define arm_aapcs_vfpcc <8 x float> @test7(<8 x i32> %in) nounwind {
; CHECK-LABEL: test7:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
-; CHECK-NEXT: vcvt.f32.s32 q1, q1, #3
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vmov.f32 s12, #8.000000e+00
+; CHECK-NEXT: vcvt.f32.s32 q4, q0
+; CHECK-NEXT: vcvt.f32.s32 q2, q1
+; CHECK-NEXT: vdiv.f32 s3, s19, s12
+; CHECK-NEXT: vdiv.f32 s7, s11, s12
+; CHECK-NEXT: vdiv.f32 s2, s18, s12
+; CHECK-NEXT: vdiv.f32 s6, s10, s12
+; CHECK-NEXT: vdiv.f32 s1, s17, s12
+; CHECK-NEXT: vdiv.f32 s5, s9, s12
+; CHECK-NEXT: vdiv.f32 s0, s16, s12
+; CHECK-NEXT: vdiv.f32 s4, s8, s12
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <8 x i32> %in to <8 x float>
@@ -179,8 +220,19 @@ entry:
define arm_aapcs_vfpcc <4 x float> @test8(<4 x i32> %in) {
; CHECK-LABEL: test8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vcvt.f32.s32 q0, q0, #1
+; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
+; CHECK-NEXT: vcvt.f32.s32 q2, q0
+; CHECK-NEXT: vdiv.f32 s2, s10, s4
+; CHECK-NEXT: vdiv.f32 s1, s9, s4
+; CHECK-NEXT: vdiv.f32 s0, s8, s4
+; CHECK-NEXT: vldr s3, LCPI11_0
; CHECK-NEXT: bx lr
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: @ %bb.1:
+; CHECK-NEXT: .data_region
+; CHECK-NEXT: LCPI11_0:
+; CHECK-NEXT: .long 0x7fc00000 @ float NaN
+; CHECK-NEXT: .end_data_region
%vcvt.i = sitofp <4 x i32> %in to <4 x float>
%div.i = fdiv <4 x float> %vcvt.i, <float 2.0, float 2.0, float 2.0, float undef>
ret <4 x float> %div.i
@@ -189,8 +241,19 @@ define arm_aapcs_vfpcc <4 x float> @test8(<4 x i32> %in) {
define arm_aapcs_vfpcc <3 x float> @test_illegal_int_to_fp(<3 x i32> %in) {
; CHECK-LABEL: test_illegal_int_to_fp:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vcvt.f32.s32 q0, q0, #2
+; CHECK-NEXT: vmov.f32 s4, #4.000000e+00
+; CHECK-NEXT: vcvt.f32.s32 q2, q0
+; CHECK-NEXT: vdiv.f32 s2, s10, s4
+; CHECK-NEXT: vdiv.f32 s1, s9, s4
+; CHECK-NEXT: vdiv.f32 s0, s8, s4
+; CHECK-NEXT: vldr s3, LCPI12_0
; CHECK-NEXT: bx lr
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: @ %bb.1:
+; CHECK-NEXT: .data_region
+; CHECK-NEXT: LCPI12_0:
+; CHECK-NEXT: .long 0x7fc00000 @ float NaN
+; CHECK-NEXT: .end_data_region
%conv = sitofp <3 x i32> %in to <3 x float>
%res = fdiv <3 x float> %conv, <float 4.0, float 4.0, float 4.0>
ret <3 x float> %res
@@ -200,9 +263,7 @@ define arm_aapcs_vfpcc <3 x float> @test_illegal_int_to_fp(<3 x i32> %in) {
define arm_aapcs_vfpcc <2 x float> @t1_mul(<2 x i32> %vecinit2.i) local_unnamed_addr #0 {
; CHECK-LABEL: t1_mul:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 d16, d0
-; CHECK-NEXT: vmov.i32 d17, #0x3e000000
-; CHECK-NEXT: vmul.f32 d0, d16, d17
+; CHECK-NEXT: vcvt.f32.s32 d0, d0, #3
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -213,9 +274,7 @@ entry:
define arm_aapcs_vfpcc <2 x float> @t2_mul(<2 x i32> %vecinit2.i) local_unnamed_addr #0 {
; CHECK-LABEL: t2_mul:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.u32 d16, d0
-; CHECK-NEXT: vmov.i32 d17, #0x3e000000
-; CHECK-NEXT: vmul.f32 d0, d16, d17
+; CHECK-NEXT: vcvt.f32.u32 d0, d0, #3
; CHECK-NEXT: bx lr
entry:
%vcvt.i = uitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -239,10 +298,7 @@ entry:
define arm_aapcs_vfpcc <2 x float> @t5_mul(<2 x i32> %vecinit2.i) local_unnamed_addr #0 {
; CHECK-LABEL: t5_mul:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 d16, d0
-; CHECK-NEXT: mov r0, #796917760
-; CHECK-NEXT: vdup.32 d17, r0
-; CHECK-NEXT: vmul.f32 d0, d16, d17
+; CHECK-NEXT: vcvt.f32.s32 d0, d0, #32
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -253,9 +309,7 @@ entry:
define arm_aapcs_vfpcc <4 x float> @t6_mul(<4 x i32> %vecinit6.i) local_unnamed_addr #0 {
; CHECK-LABEL: t6_mul:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 q8, q0
-; CHECK-NEXT: vmov.i32 q9, #0x3e000000
-; CHECK-NEXT: vmul.f32 q0, q8, q9
+; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <4 x i32> %vecinit6.i to <4 x float>
@@ -267,9 +321,7 @@ define arm_aapcs_vfpcc <4 x float> @fix_unsigned_i16_to_float_mul(<4 x i16> %in)
; CHECK-LABEL: fix_unsigned_i16_to_float_mul:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmovl.u16 q8, d0
-; CHECK-NEXT: vmov.i32 q9, #0x3f000000
-; CHECK-NEXT: vcvt.f32.u32 q8, q8
-; CHECK-NEXT: vmul.f32 q0, q8, q9
+; CHECK-NEXT: vcvt.f32.u32 q0, q8, #1
; CHECK-NEXT: bx lr
%conv = uitofp <4 x i16> %in to <4 x float>
%shift = fmul <4 x float> %conv, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
@@ -280,9 +332,7 @@ define arm_aapcs_vfpcc <4 x float> @fix_signed_i16_to_float_mul(<4 x i16> %in) l
; CHECK-LABEL: fix_signed_i16_to_float_mul:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmovl.s16 q8, d0
-; CHECK-NEXT: vmov.i32 q9, #0x3f000000
-; CHECK-NEXT: vcvt.f32.s32 q8, q8
-; CHECK-NEXT: vmul.f32 q0, q8, q9
+; CHECK-NEXT: vcvt.f32.s32 q0, q8, #1
; CHECK-NEXT: bx lr
%conv = sitofp <4 x i16> %in to <4 x float>
%shift = fmul <4 x float> %conv, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
@@ -340,11 +390,8 @@ define arm_aapcs_vfpcc <2 x double> @fix_i64_to_double_mul(<2 x i64> %in) local_
define arm_aapcs_vfpcc <8 x float> @test7_mul(<8 x i32> %in) local_unnamed_addr #0 {
; CHECK-LABEL: test7_mul:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 q8, q0
-; CHECK-NEXT: vcvt.f32.s32 q9, q1
-; CHECK-NEXT: vmov.i32 q10, #0x3e000000
-; CHECK-NEXT: vmul.f32 q0, q8, q10
-; CHECK-NEXT: vmul.f32 q1, q9, q10
+; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
+; CHECK-NEXT: vcvt.f32.s32 q1, q1, #3
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <8 x i32> %in to <8 x float>
@@ -355,9 +402,7 @@ entry:
define arm_aapcs_vfpcc <3 x float> @test_illegal_int_to_fp_mul(<3 x i32> %in) local_unnamed_addr #0 {
; CHECK-LABEL: test_illegal_int_to_fp_mul:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vcvt.f32.s32 q8, q0
-; CHECK-NEXT: vmov.f32 q9, #2.500000e-01
-; CHECK-NEXT: vmul.f32 q0, q8, q9
+; CHECK-NEXT: vcvt.f32.s32 q0, q0, #2
; CHECK-NEXT: bx lr
%conv = sitofp <3 x i32> %in to <3 x float>
%res = fmul <3 x float> %conv, <float 2.500000e-01, float 2.500000e-01, float 2.500000e-01>
More information about the llvm-commits
mailing list