[llvm] [AArch64] Avoid NEON fixed-point SCVTF in Streaming-SVE mode. (PR #91924)

Mon May 13 03:19:12 PDT 2024

https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/91924

>From f744db3bd7a067b5b59afac012ddee252c1ec7f8 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 26 Apr 2024 12:07:15 +0100
Subject: [PATCH] [AArch64] Remove redundant FDIV Combine.

The target combine is no longer required because InstCombine will
transform an FDIV by a power of 2 into a multiply (by the reciprocal),
so in practice this case will never trigger.

Additionally, the generated code would have been incorrect for
streaming(-compatible) functions, because it assumed NEON was available.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  73 +---------
 llvm/test/CodeGen/AArch64/fdiv_combine.ll     | 126 ------------------
 .../CodeGen/AArch64/sitofp-fixed-legal.ll     |  42 ------
 3 files changed, 1 insertion(+), 240 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/fdiv_combine.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7344387ffe552..823ae05f409f3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1040,7 +1040,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                        ISD::UINT_TO_FP});
 
   setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
-                       ISD::FP_TO_UINT_SAT, ISD::FADD, ISD::FDIV});
+                       ISD::FP_TO_UINT_SAT, ISD::FADD});
 
   // Try and combine setcc with csel
   setTargetDAGCombine(ISD::SETCC);
@@ -17907,75 +17907,6 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
   return FixConv;
 }
 
-/// Fold a floating-point divide by power of two into fixed-point to
-/// floating-point conversion.
-static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
-                                  TargetLowering::DAGCombinerInfo &DCI,
-                                  const AArch64Subtarget *Subtarget) {
-  if (!Subtarget->hasNEON())
-    return SDValue();
-
-  SDValue Op = N->getOperand(0);
-  unsigned Opc = Op->getOpcode();
-  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
-      !Op.getOperand(0).getValueType().isSimple() ||
-      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
-    return SDValue();
-
-  SDValue ConstVec = N->getOperand(1);
-  if (!isa<BuildVectorSDNode>(ConstVec))
-    return SDValue();
-
-  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
-  int32_t IntBits = IntTy.getSizeInBits();
-  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
-    return SDValue();
-
-  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
-  int32_t FloatBits = FloatTy.getSizeInBits();
-  if (FloatBits != 32 && FloatBits != 64)
-    return SDValue();
-
-  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
-  if (IntBits > FloatBits)
-    return SDValue();
-
-  BitVector UndefElements;
-  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
-  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
-  if (C == -1 || C == 0 || C > FloatBits)
-    return SDValue();
-
-  MVT ResTy;
-  unsigned NumLanes = Op.getValueType().getVectorNumElements();
-  switch (NumLanes) {
-  default:
-    return SDValue();
-  case 2:
-    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
-    break;
-  case 4:
-    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
-    break;
-  }
-
-  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDLoc DL(N);
-  SDValue ConvInput = Op.getOperand(0);
-  bool IsSigned = Opc == ISD::SINT_TO_FP;
-  if (IntBits < FloatBits)
-    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
-                            ResTy, ConvInput);
-
-  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
-                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
-                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
-                     DAG.getConstant(C, DL, MVT::i32));
-}
-
 static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64TargetLowering &TLI) {
   EVT VT = N->getValueType(0);
@@ -24624,8 +24555,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FP_TO_SINT_SAT:
   case ISD::FP_TO_UINT_SAT:
     return performFpToIntCombine(N, DAG, DCI, Subtarget);
-  case ISD::FDIV:
-    return performFDivCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:
     return performORCombine(N, DCI, Subtarget, *this);
   case ISD::AND:
diff --git a/llvm/test/CodeGen/AArch64/fdiv_combine.ll b/llvm/test/CodeGen/AArch64/fdiv_combine.ll
deleted file mode 100644
index 10b5f4386dd56..0000000000000
--- a/llvm/test/CodeGen/AArch64/fdiv_combine.ll
+++ /dev/null
@@ -1,126 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s
-
-; Test signed conversion.
-define <2 x float> @test1(<2 x i32> %in) {
-; CHECK-LABEL: test1:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf.2s v0, v0, #4
-; CHECK-NEXT:    ret
-entry:
-  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
-  %div.i = fdiv <2 x float> %vcvt.i, <float 16.0, float 16.0>
-  ret <2 x float> %div.i
-}
-
-; Test unsigned conversion.
-define <2 x float> @test2(<2 x i32> %in) {
-; CHECK-LABEL: test2:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf.2s v0, v0, #3
-; CHECK-NEXT:    ret
-entry:
-  %vcvt.i = uitofp <2 x i32> %in to <2 x float>
-  %div.i = fdiv <2 x float> %vcvt.i, <float 8.0, float 8.0>
-  ret <2 x float> %div.i
-}
-
-; Test which should not fold due to non-power of 2.
-define <2 x float> @test3(<2 x i32> %in) {
-; CHECK-LABEL: test3:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov.2s v1, #9.00000000
-; CHECK-NEXT:    scvtf.2s v0, v0
-; CHECK-NEXT:    fdiv.2s v0, v0, v1
-; CHECK-NEXT:    ret
-entry:
-  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
-  %div.i = fdiv <2 x float> %vcvt.i, <float 9.0, float 9.0>
-  ret <2 x float> %div.i
-}
-
-; Test which should not fold due to power of 2 out of range.
-define <2 x float> @test4(<2 x i32> %in) {
-; CHECK-LABEL: test4:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi.2s v1, #80, lsl #24
-; CHECK-NEXT:    scvtf.2s v0, v0
-; CHECK-NEXT:    fdiv.2s v0, v0, v1
-; CHECK-NEXT:    ret
-entry:
-  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
-  %div.i = fdiv <2 x float> %vcvt.i, <float 0x4200000000000000, float 0x4200000000000000>
-  ret <2 x float> %div.i
-}
-
-; Test case where const is max power of 2 (i.e., 2^32).
-define <2 x float> @test5(<2 x i32> %in) {
-; CHECK-LABEL: test5:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf.2s v0, v0, #32
-; CHECK-NEXT:    ret
-entry:
-  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
-  %div.i = fdiv <2 x float> %vcvt.i, <float 0x41F0000000000000, float 0x41F0000000000000>
-  ret <2 x float> %div.i
-}
-
-; Test quadword.
-define <4 x float> @test6(<4 x i32> %in) {
-; CHECK-LABEL: test6:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf.4s v0, v0, #2
-; CHECK-NEXT:    ret
-entry:
-  %vcvt.i = sitofp <4 x i32> %in to <4 x float>
-  %div.i = fdiv <4 x float> %vcvt.i, <float 4.0, float 4.0, float 4.0, float 4.0>
-  ret <4 x float> %div.i
-}
-
-; Test unsigned i16 to float
-define <4 x float> @test7(<4 x i16> %in) {
-; CHECK-LABEL: test7:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ucvtf.4s v0, v0, #1
-; CHECK-NEXT:    ret
-  %conv = uitofp <4 x i16> %in to <4 x float>
-  %shift = fdiv <4 x float> %conv, <float 2.0, float 2.0, float 2.0, float 2.0>
-  ret <4 x float> %shift
-}
-
-; Test signed i16 to float
-define <4 x float> @test8(<4 x i16> %in) {
-; CHECK-LABEL: test8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    scvtf.4s v0, v0, #2
-; CHECK-NEXT:    ret
-  %conv = sitofp <4 x i16> %in to <4 x float>
-  %shift = fdiv <4 x float> %conv, <float 4.0, float 4.0, float 4.0, float 4.0>
-  ret <4 x float> %shift
-}
-
-; Can't convert i64 to float.
-define <2 x float> @test9(<2 x i64> %in) {
-; CHECK-LABEL: test9:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf.2d v0, v0
-; CHECK-NEXT:    movi.2s v1, #64, lsl #24
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fdiv.2s v0, v0, v1
-; CHECK-NEXT:    ret
-  %conv = uitofp <2 x i64> %in to <2 x float>
-  %shift = fdiv <2 x float> %conv, <float 2.0, float 2.0>
-  ret <2 x float> %shift
-}
-
-define <2 x double> @test10(<2 x i64> %in) {
-; CHECK-LABEL: test10:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf.2d v0, v0, #1
-; CHECK-NEXT:    ret
-  %conv = uitofp <2 x i64> %in to <2 x double>
-  %shift = fdiv <2 x double> %conv, <double 2.0, double 2.0>
-  ret <2 x double> %shift
-}
diff --git a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
deleted file mode 100644
index 5a5a669e92eeb..0000000000000
--- a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-apple-ios %s -o - | FileCheck %s
-
-define <16 x double> @test_sitofp_fixed(<16 x i32> %in) {
-; CHECK-LABEL: test_sitofp_fixed:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sshll2.2d v4, v0, #0
-; CHECK-NEXT:    sshll.2d v0, v0, #0
-; CHECK-NEXT:    sshll2.2d v5, v1, #0
-; CHECK-NEXT:    sshll.2d v6, v1, #0
-; CHECK-NEXT:    sshll.2d v7, v2, #0
-; CHECK-NEXT:    sshll2.2d v16, v2, #0
-; CHECK-NEXT:    sshll2.2d v17, v3, #0
-; CHECK-NEXT:    sshll.2d v18, v3, #0
-; CHECK-NEXT:    scvtf.2d v1, v4, #6
-; CHECK-NEXT:    scvtf.2d v0, v0, #6
-; CHECK-NEXT:    scvtf.2d v3, v5, #6
-; CHECK-NEXT:    scvtf.2d v2, v6, #6
-; CHECK-NEXT:    scvtf.2d v4, v7, #6
-; CHECK-NEXT:    scvtf.2d v5, v16, #6
-; CHECK-NEXT:    scvtf.2d v7, v17, #6
-; CHECK-NEXT:    scvtf.2d v6, v18, #6
-; CHECK-NEXT:    ret
-
-  %flt = sitofp <16 x i32> %in to <16 x double>
-  %res = fdiv <16 x double> %flt, <double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0, double 64.0>
-  ret <16 x double> %res
-}
-
-; This one is small enough to satisfy isSimple, but still illegally large.
-define <4 x double> @test_sitofp_fixed_shortish(<4 x i64> %in) {
-; CHECK-LABEL: test_sitofp_fixed_shortish:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    scvtf.2d v0, v0, #6
-; CHECK-NEXT:    scvtf.2d v1, v1, #6
-; CHECK-NEXT:    ret
-
-
-  %flt = sitofp <4 x i64> %in to <4 x double>
-  %res = fdiv <4 x double> %flt, <double 64.0, double 64.0, double 64.0, double 64.0>
-  ret <4 x double> %res
-}