[llvm] [AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions (PR #112564)

Thu Oct 24 09:58:38 PDT 2024

================
@@ -18929,13 +18929,94 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
+/// Tries to replace scalar FP <-> conversions with SVE in streaming functions.
+static SDValue
+tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
+                                    const AArch64Subtarget *Subtarget) {
+  if (N->isStrictFPOpcode())
+    return SDValue();
+
+  auto isSupportedType = [](EVT VT) {
+    if (!VT.isSimple())
+      return false;
+    // There are SVE instructions that can convert to/from all pairs of these
+    // int and float types. Note: We don't bother with i8 or i16 as those are
+    // illegal types for scalars.
+    return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
+                        VT.getSimpleVT().SimpleTy);
+  };
+
+  if (!isSupportedType(N->getValueType(0)) ||
+      !isSupportedType(N->getOperand(0).getValueType()))
+    return SDValue();
+
+  // If we are in a streaming[-compatible] function, use SVE for scalar FP <->
+  // INT conversions as this can help avoid moves between GPRs and FPRs, which
+  // could be quite expensive.
+  if (!Subtarget->isSVEorStreamingSVEAvailable() ||
+      (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
+    return SDValue();
+
+  unsigned Opc = N->getOpcode();
+  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::FP_TO_SINT;
+
+  SDValue SrcVal = N->getOperand(0);
+  EVT SrcTy = SrcVal.getValueType();
+  EVT DestTy = N->getValueType(0);
+
+  EVT SrcVecTy;
+  EVT DestVecTy;
+  if (DestTy.bitsGT(SrcTy)) {
+    DestVecTy = getPackedSVEVectorVT(DestTy);
+    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
+                                 : DestVecTy.changeVectorElementType(SrcTy);
+  } else {
+    SrcVecTy = getPackedSVEVectorVT(SrcTy);
+    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
+                                   : SrcVecTy.changeVectorElementType(DestTy);
+  }
+
+  SDLoc DL(N);
+  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
+                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
+
+  // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal
+  // type (unlike the equivalent nxv2f32 for floating-point types). So,
+  // unfortunately, the only way to lower to these variants is via the
+  // intrinsics. Note: We could sign/zero extend to the i64 variant, but that
+  // may result in extra extends or fmovs in the final assembly.
+  bool IsI32ToF64 = SrcTy == MVT::i32 && DestTy == MVT::f64;
+  bool isF64ToI32 = SrcTy == MVT::f64 && DestTy == MVT::i32;
+  if (IsI32ToF64 || isF64ToI32) {
+    unsigned IntrinsicOpc;
+    if (IsI32ToF64)
+      IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_scvtf_f64i32
+                              : Intrinsic::aarch64_sve_ucvtf_f64i32;
+    else
+      IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_fcvtzs_i32f64
+                              : Intrinsic::aarch64_sve_fcvtzu_i32f64;
+    SDValue PTrue = getPredicateForVector(DAG, DL, MVT::nxv2f64);
+    Vec = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, DestVecTy,
+                      {DAG.getConstant(IntrinsicOpc, DL, MVT::i32),
+                       DAG.getUNDEF(DestTy), PTrue, Vec});
----------------
MacDue wrote:

Not sure about this, but the easiest way I could think of to get the desired codegen was to directly use intrinsics for the `f64` <-> `i32` cases. 

https://github.com/llvm/llvm-project/pull/112564