[llvm] [AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions (PR #112564)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 24 10:57:25 PDT 2024


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/112564

>From 44892e524edb3041ce6ca8b74baeba568668b9ab Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 16 Oct 2024 14:45:43 +0000
Subject: [PATCH 1/8] [AArch64][SVE] Avoid transfer to GPRs for fp -> int -> fp
 conversions

When Neon is not available, use SVE variants of FCVTZS, FCVTZU, UCVTF,
and SCVTF for fp -> int -> fp conversions to avoid moving values
to/from GPRs, which may be expensive.

Note: With +sme2p2 the single-element vector Neon variants of these
instructions could be used instead (but that feature is not implemented
yet).

Follow up to #112213.
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 35 ++++++++++++++
 .../sve-streaming-mode-cvt-fp-int-fp.ll       | 46 ++++++++++++-------
 2 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 59859cb7442d59..b99f2ee7e1b488 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2421,6 +2421,41 @@ let Predicates = [HasSVEorSME] in {
   defm FSQRT_ZPmZ  : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt",  AArch64fsqrt_mt>;
 } // End HasSVEorSME
 
+// Helper for creating fp -> int -> fp conversions using SVE.
+class sve_fp_int_fp_cvt<Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
+  : OutPatFrag<(ops node: $Rn),
+    (EXTRACT_SUBREG
+      (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
+        (TO_INT (IMPLICIT_DEF), (PTRUE 1),
+          (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
+
+// Some float -> int -> float conversion patterns where we want to keep the int
+// values in FP registers using the SVE instructions to avoid costly GPR <-> FPR
+// register transfers. Only used when NEON is not available (e.g. in streaming
+// functions).
+// TODO: When +sme2p2 is available single-element vectors should be preferred.
+def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
+let Predicates = [HasSVEorSME, HasNoNEON] in {
+def : Pat<
+  (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
+def : Pat<
+  (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
+def : Pat<
+  (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
+def : Pat<
+  (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
+def : Pat<
+  (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
+def : Pat<
+  (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
+} // End HasSVEorSME, HasNoNEON
+
 let Predicates = [HasBF16, HasSVEorSME] in {
   defm BFDOT_ZZZ    : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>;
   defm BFDOT_ZZI    : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index f402463de7be81..0acc107280ac83 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -8,8 +8,11 @@ target triple = "aarch64-unknown-linux-gnu"
 define double @t1(double %x) {
 ; CHECK-LABEL: t1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t1:
@@ -26,8 +29,11 @@ entry:
 define float @t2(float %x) {
 ; CHECK-LABEL: t2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzs w8, s0
-; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t2:
@@ -44,10 +50,11 @@ entry:
 define half @t3(half %x)  {
 ; CHECK-LABEL: t3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcvtzs w8, s0
-; CHECK-NEXT:    scvtf s0, w8
-; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t3:
@@ -66,8 +73,11 @@ entry:
 define double @t4(double %x) {
 ; CHECK-LABEL: t4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzu x8, d0
-; CHECK-NEXT:    ucvtf d0, x8
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t4:
@@ -84,8 +94,11 @@ entry:
 define float @t5(float %x) {
 ; CHECK-LABEL: t5:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzu w8, s0
-; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t5:
@@ -102,10 +115,11 @@ entry:
 define half @t6(half %x)  {
 ; CHECK-LABEL: t6:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcvtzu w8, s0
-; CHECK-NEXT:    ucvtf s0, w8
-; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; USE-NEON-NO-GPRS-LABEL: t6:

>From 68540edf58bc4e0814203658f72d2572e0f4413a Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 16 Oct 2024 15:22:57 +0000
Subject: [PATCH 2/8] Note that this is for scalars

---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index b99f2ee7e1b488..10ea3ad8392ffc 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2421,39 +2421,40 @@ let Predicates = [HasSVEorSME] in {
   defm FSQRT_ZPmZ  : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt",  AArch64fsqrt_mt>;
 } // End HasSVEorSME
 
-// Helper for creating fp -> int -> fp conversions using SVE.
-class sve_fp_int_fp_cvt<Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
+// Helper for creating scalar fp -> int -> fp conversions using SVE.
+class sve_scalar_fp_int_fp_cvt
+  <Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
   : OutPatFrag<(ops node: $Rn),
     (EXTRACT_SUBREG
       (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
         (TO_INT (IMPLICIT_DEF), (PTRUE 1),
           (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
 
-// Some float -> int -> float conversion patterns where we want to keep the int
-// values in FP registers using the SVE instructions to avoid costly GPR <-> FPR
-// register transfers. Only used when NEON is not available (e.g. in streaming
-// functions).
-// TODO: When +sme2p2 is available single-element vectors should be preferred.
+// Some scalar float -> int -> float conversion patterns where we want to keep
+// the int values in FP registers to avoid costly GPR <-> FPR register
+// transfers using SVE instructions. Only used when NEON is not available (e.g.
+// in streaming functions).
+// TODO: When +sme2p2 is available Neon single-element vectors should be preferred.
 def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
 let Predicates = [HasSVEorSME, HasNoNEON] in {
 def : Pat<
   (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
-  (sve_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
+  (sve_scalar_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
 def : Pat<
   (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
-  (sve_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
+  (sve_scalar_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
 def : Pat<
   (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
-  (sve_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
+  (sve_scalar_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
 def : Pat<
   (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
-  (sve_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
+  (sve_scalar_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
 def : Pat<
   (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
-  (sve_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
+  (sve_scalar_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
 def : Pat<
   (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
-  (sve_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
+  (sve_scalar_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
 } // End HasSVEorSME, HasNoNEON
 
 let Predicates = [HasBF16, HasSVEorSME] in {

>From 56b94106c57aaca221674e7872a59b1ea6b2949e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 21 Oct 2024 21:11:44 +0000
Subject: [PATCH 3/8] Lower scalar FP converts to SVE

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  98 ++++-
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  40 +-
 .../sve-streaming-mode-cvt-fp-int-fp.ll       |  20 +-
 .../sve-streaming-mode-cvt-fp-to-int.ll       | 264 +++++++++++++
 .../sve-streaming-mode-cvt-int-to-fp.ll       | 265 +++++++++++++
 ...e-streaming-mode-fixed-length-fp-to-int.ll | 366 ++++++++----------
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 121 ++++--
 7 files changed, 880 insertions(+), 294 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5a848ada9dd8ee..ab329a6dc79080 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1454,8 +1454,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::UINT_TO_FP, VT, Custom);
       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Custom);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Custom);
       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::MULHS, VT, Custom);
@@ -2138,6 +2142,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::FP_ROUND, VT, Default);
   setOperationAction(ISD::FP_TO_SINT, VT, Default);
   setOperationAction(ISD::FP_TO_UINT, VT, Default);
+  setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Default);
+  setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Default);
   setOperationAction(ISD::FRINT, VT, Default);
   setOperationAction(ISD::LRINT, VT, Default);
   setOperationAction(ISD::LLRINT, VT, Default);
@@ -2164,6 +2170,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::SIGN_EXTEND, VT, Default);
   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Default);
   setOperationAction(ISD::SINT_TO_FP, VT, Default);
+  setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Default);
   setOperationAction(ISD::SMAX, VT, Default);
   setOperationAction(ISD::SMIN, VT, Default);
   setOperationAction(ISD::SPLAT_VECTOR, VT, Default);
@@ -2174,6 +2181,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::TRUNCATE, VT, Default);
   setOperationAction(ISD::UDIV, VT, Default);
   setOperationAction(ISD::UINT_TO_FP, VT, Default);
+  setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Default);
   setOperationAction(ISD::UMAX, VT, Default);
   setOperationAction(ISD::UMIN, VT, Default);
   setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
@@ -4550,9 +4558,10 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   EVT VT = Op.getValueType();
 
   if (VT.isScalableVector()) {
-    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
-                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
-                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
+    unsigned Opc = Op.getOpcode();
+    bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
+    unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
+                               : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
     return LowerToPredicatedOp(Op, DAG, Opcode);
   }
 
@@ -4628,6 +4637,51 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   return Op;
 }
 
+static bool CanLowerToScalarSVEFPIntConversion(EVT VT) {
+  if (!VT.isSimple())
+    return false;
+  // There are SVE instructions that can convert to/from all pairs of these int
+  // and float types. Note: We don't bother with i8 or i16 as those are illegal
+  // types for scalars.
+  return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
+                      VT.getSimpleVT().SimpleTy);
+}
+
+/// Lowers a scalar FP conversion (to/from) int to SVE.
+static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) {
+  bool IsStrict = Op->isStrictFPOpcode();
+  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+  EVT SrcTy = SrcVal.getValueType();
+  EVT DestTy = Op.getValueType();
+  EVT SrcVecTy;
+  EVT DestVecTy;
+  // Use a packed vector for the larger type.
+  // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that
+  // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as
+  // (unlike floats) nxv2i32 is an illegal unpacked type.
+  if (DestTy.bitsGT(SrcTy)) {
+    DestVecTy = getPackedSVEVectorVT(DestTy);
+    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
+                                 : DestVecTy.changeVectorElementType(SrcTy);
+  } else {
+    SrcVecTy = getPackedSVEVectorVT(SrcTy);
+    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
+                                   : SrcVecTy.changeVectorElementType(DestTy);
+  }
+  SDLoc dl(Op);
+  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
+  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
+                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
+  Vec = IsStrict ? DAG.getNode(Op.getOpcode(), dl, {DestVecTy, MVT::Other},
+                               {Op.getOperand(0), Vec})
+                 : DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec);
+  SDValue Scalar =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, ZeroIdx);
+  if (IsStrict)
+    return DAG.getMergeValues({Scalar, Vec.getValue(1)}, dl);
+  return Scalar;
+}
+
 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
                                               SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
@@ -4636,6 +4690,12 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
   if (SrcVal.getValueType().isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
+  if (!Subtarget->isNeonAvailable() &&
+      Subtarget->isSVEorStreamingSVEAvailable() &&
+      CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
+      CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
+    return LowerScalarFPConversionToSVE(Op, DAG);
+
   // f16 conversions are promoted to f32 when full fp16 is not supported.
   if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
       SrcVal.getValueType() == MVT::bf16) {
@@ -4939,6 +4999,12 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
   bool IsStrict = Op->isStrictFPOpcode();
   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
 
+  if (!Subtarget->isNeonAvailable() &&
+      Subtarget->isSVEorStreamingSVEAvailable() &&
+      CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
+      CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
+    return LowerScalarFPConversionToSVE(Op, DAG);
+
   bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
                   Op->getOpcode() == ISD::SINT_TO_FP;
 
@@ -28327,7 +28393,21 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                    unsigned NewOp) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
-  auto Pg = getPredicateForVector(DAG, DL, VT);
+  SDValue Pg;
+
+  // FCVTZS_ZPmZ_DtoS and FCVTZU_ZPmZ_DtoS are special cases. These operations
+  // return nxv4i32 rather than the correct nxv2i32, as nxv2i32 is an illegal
+  // unpacked type. So, in this case, we take the predicate size from the
+  // operand.
+  SDValue LastOp{};
+  if ((NewOp == AArch64ISD::FCVTZU_MERGE_PASSTHRU ||
+       NewOp == AArch64ISD::FCVTZS_MERGE_PASSTHRU) &&
+      VT == MVT::nxv4i32 &&
+      (LastOp = Op->ops().back().get()).getValueType() == MVT::nxv2f64) {
+    Pg = getPredicateForVector(DAG, DL, LastOp.getValueType());
+  } else {
+    Pg = getPredicateForVector(DAG, DL, VT);
+  }
 
   if (VT.isFixedLengthVector()) {
     assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
@@ -28363,7 +28443,12 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
   assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
 
   SmallVector<SDValue, 4> Operands = {Pg};
+  SDValue Chain{};
   for (const SDValue &V : Op->op_values()) {
+    if (!isa<CondCodeSDNode>(V) && V.getValueType() == MVT::Other) {
+      Chain = V;
+      continue;
+    }
     assert((!V.getValueType().isVector() ||
             V.getValueType().isScalableVector()) &&
            "Only scalable vectors are supported!");
@@ -28373,7 +28458,10 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
   if (isMergePassthruOpcode(NewOp))
     Operands.push_back(DAG.getUNDEF(VT));
 
-  return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
+  auto NewNode = DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
+  if (Chain)
+    return DAG.getMergeValues({NewNode, Chain}, DL);
+  return NewNode;
 }
 
 // If a fixed length vector operation has no side effects when applied to
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 10ea3ad8392ffc..dfff9c627540be 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2338,8 +2338,8 @@ let Predicates = [HasSVEorSME] in {
   defm UCVTF_ZPmZ_DtoH  : sve_fp_2op_p_zd< 0b0110111, "ucvtf",  ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64,  AArch64ucvtf_mt,  nxv2f16, nxv2i1, nxv2i64, ElementSizeD>;
   defm SCVTF_ZPmZ_DtoD  : sve_fp_2op_p_zd< 0b1110110, "scvtf",  ZPR64, ZPR64, null_frag,                     AArch64scvtf_mt,  nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
   defm UCVTF_ZPmZ_DtoD  : sve_fp_2op_p_zd< 0b1110111, "ucvtf",  ZPR64, ZPR64, null_frag,                     AArch64ucvtf_mt,  nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
-  defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag,        nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
-  defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag,        nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+  defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, AArch64fcvtzs_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+  defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, AArch64fcvtzu_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
   defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>;
   defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>;
   defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>;
@@ -2421,42 +2421,6 @@ let Predicates = [HasSVEorSME] in {
   defm FSQRT_ZPmZ  : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt",  AArch64fsqrt_mt>;
 } // End HasSVEorSME
 
-// Helper for creating scalar fp -> int -> fp conversions using SVE.
-class sve_scalar_fp_int_fp_cvt
-  <Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
-  : OutPatFrag<(ops node: $Rn),
-    (EXTRACT_SUBREG
-      (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
-        (TO_INT (IMPLICIT_DEF), (PTRUE 1),
-          (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
-
-// Some scalar float -> int -> float conversion patterns where we want to keep
-// the int values in FP registers to avoid costly GPR <-> FPR register
-// transfers using SVE instructions. Only used when NEON is not available (e.g.
-// in streaming functions).
-// TODO: When +sme2p2 is available Neon single-element vectors should be preferred.
-def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
-let Predicates = [HasSVEorSME, HasNoNEON] in {
-def : Pat<
-  (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
-def : Pat<
-  (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
-def : Pat<
-  (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
-def : Pat<
-  (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
-def : Pat<
-  (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
-def : Pat<
-  (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
-  (sve_scalar_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
-} // End HasSVEorSME, HasNoNEON
-
 let Predicates = [HasBF16, HasSVEorSME] in {
   defm BFDOT_ZZZ    : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>;
   defm BFDOT_ZZI    : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index 0acc107280ac83..0f4cb2060f2498 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -8,7 +8,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define double @t1(double %x) {
 ; CHECK-LABEL: t1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
@@ -29,7 +29,7 @@ entry:
 define float @t2(float %x) {
 ; CHECK-LABEL: t2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
@@ -50,10 +50,10 @@ entry:
 define half @t3(half %x)  {
 ; CHECK-LABEL: t3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
-; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -73,7 +73,7 @@ entry:
 define double @t4(double %x) {
 ; CHECK-LABEL: t4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
@@ -94,7 +94,7 @@ entry:
 define float @t5(float %x) {
 ; CHECK-LABEL: t5:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
@@ -115,10 +115,10 @@ entry:
 define half @t6(half %x)  {
 ; CHECK-LABEL: t6:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
-; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
new file mode 100644
index 00000000000000..60d3124f5b21e8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @f16_to_s32(half %x) {
+; CHECK-LABEL: f16_to_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f16_to_s32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptosi half %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f16_to_s64(half %x) {
+; CHECK-LABEL: f16_to_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f16_to_s64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptosi half %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f32_to_s32(float %x) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f32_to_s32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs w0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptosi float %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f32_to_s64(float %x) {
+; CHECK-LABEL: f32_to_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f32_to_s64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs x0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptosi float %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f64_to_s32(double %x) {
+; CHECK-LABEL: f64_to_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f64_to_s32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs w0, d0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptosi double %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f64_to_s64(double %x) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f64_to_s64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs x0, d0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptosi double %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f16_to_u32(half %x) {
+; CHECK-LABEL: f16_to_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f16_to_u32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptoui half %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f16_to_u64(half %x) {
+; CHECK-LABEL: f16_to_u64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f16_to_u64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptoui half %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f32_to_u32(float %x) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f32_to_u32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu w0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptoui float %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f32_to_u64(float %x) {
+; CHECK-LABEL: f32_to_u64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f32_to_u64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu x0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptoui float %x to i64
+  ret i64 %cvt
+}
+
+define i32 @f64_to_u32(double %x) {
+; CHECK-LABEL: f64_to_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.d
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f64_to_u32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu w0, d0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptoui double %x to i32
+  ret i32 %cvt
+}
+
+define i64 @f64_to_u64(double %x) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: f64_to_u64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu x0, d0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = fptoui double %x to i64
+  ret i64 %cvt
+}
+
+define i32 @strict_convert_signed(double %x) {
+; CHECK-LABEL: strict_convert_signed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: strict_convert_signed:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs w0, d0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %x, metadata !"fpexcept.strict") #0
+  ret i32 %cvt
+}
+
+define i32 @strict_convert_unsigned(float %x) {
+; CHECK-LABEL: strict_convert_unsigned:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: strict_convert_unsigned:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu w0, s0
+; NONEON-NOSVE-NEXT:    ret
+  entry:
+  %cvt = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %x, metadata !"fpexcept.strict") #0
+  ret i32 %cvt
+}
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
new file mode 100644
index 00000000000000..42be60ad559705
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define half @s32_to_f16(i32 %x) {
+; CHECK-LABEL: s32_to_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: s32_to_f16:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    scvtf s0, w0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = sitofp i32 %x to half ; signed i32 -> f16; SVE/SME runs expect fmov into s0 then a predicated scvtf (.s -> .h), the no-SVE run a scalar scvtf plus fcvt
+  ret half %cvt
+}
+
+define float @s32_to_f32(i32 %x) {
+; CHECK-LABEL: s32_to_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: s32_to_f32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    scvtf s0, w0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = sitofp i32 %x to float ; signed i32 -> f32; SVE/SME runs expect fmov into s0 then a predicated scvtf (.s -> .s), the no-SVE run a scalar scvtf
+  ret float %cvt
+}
+
+define double @s32_to_f64(i32 %x) {
+; CHECK-LABEL: s32_to_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: s32_to_f64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    scvtf d0, w0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = sitofp i32 %x to double ; signed i32 -> f64; SVE/SME runs expect fmov into s0 then a predicated scvtf (.s -> .d, predicate built for .d), the no-SVE run a scalar scvtf
+  ret double %cvt
+}
+
+define half @u32_to_f16(i32 %x) {
+; CHECK-LABEL: u32_to_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: u32_to_f16:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ucvtf s0, w0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = uitofp i32 %x to half ; unsigned i32 -> f16; SVE/SME runs expect fmov into s0 then a predicated ucvtf (.s -> .h), the no-SVE run a scalar ucvtf plus fcvt
+  ret half %cvt
+}
+
+define float @u32_to_f32(i32 %x) {
+; CHECK-LABEL: u32_to_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: u32_to_f32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ucvtf s0, w0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = uitofp i32 %x to float ; unsigned i32 -> f32; SVE/SME runs expect fmov into s0 then a predicated ucvtf (.s -> .s), the no-SVE run a scalar ucvtf
+  ret float %cvt
+}
+
+define double @u32_to_f64(i32 %x) {
+; CHECK-LABEL: u32_to_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: u32_to_f64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ucvtf d0, w0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = uitofp i32 %x to double ; unsigned i32 -> f64; SVE/SME runs expect fmov into s0 then a predicated ucvtf (.s -> .d, predicate built for .d), the no-SVE run a scalar ucvtf
+  ret double %cvt
+}
+
+define half @s64_to_f16(i64 %x) {
+; CHECK-LABEL: s64_to_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: s64_to_f16:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    scvtf s0, x0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = sitofp i64 %x to half ; signed i64 -> f16; SVE/SME runs expect fmov into d0 then a predicated scvtf (.d -> .h), the no-SVE run a scalar scvtf plus fcvt
+  ret half %cvt
+}
+
+define float @s64_to_f32(i64 %x) {
+; CHECK-LABEL: s64_to_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: s64_to_f32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    scvtf s0, x0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = sitofp i64 %x to float ; signed i64 -> f32; SVE/SME runs expect fmov into d0 then a predicated scvtf (.d -> .s), the no-SVE run a scalar scvtf
+  ret float %cvt
+}
+
+define double @s64_to_f64(i64 %x) {
+; CHECK-LABEL: s64_to_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: s64_to_f64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    scvtf d0, x0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = sitofp i64 %x to double ; signed i64 -> f64; SVE/SME runs expect fmov into d0 then a predicated scvtf (.d -> .d), the no-SVE run a scalar scvtf
+  ret double %cvt
+}
+
+define half @u64_to_f16(i64 %x) {
+; CHECK-LABEL: u64_to_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: u64_to_f16:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ucvtf s0, x0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = uitofp i64 %x to half ; unsigned i64 -> f16; SVE/SME runs expect fmov into d0 then a predicated ucvtf (.d -> .h), the no-SVE run a scalar ucvtf plus fcvt
+  ret half %cvt
+}
+
+define float @u64_to_f32(i64 %x) {
+; CHECK-LABEL: u64_to_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: u64_to_f32:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ucvtf s0, x0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = uitofp i64 %x to float ; unsigned i64 -> f32; SVE/SME runs expect fmov into d0 then a predicated ucvtf (.d -> .s), the no-SVE run a scalar ucvtf
+  ret float %cvt
+}
+
+define double @u64_to_f64(i64 %x) {
+; CHECK-LABEL: u64_to_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: u64_to_f64:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ucvtf d0, x0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = uitofp i64 %x to double ; unsigned i64 -> f64; SVE/SME runs expect fmov into d0 then a predicated ucvtf (.d -> .d), the no-SVE run a scalar ucvtf
+  ret double %cvt
+}
+
+define half @strict_convert_signed(i32 %x) #0 {
+; CHECK-LABEL: strict_convert_signed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: strict_convert_signed:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    scvtf s0, w0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 ; strict signed i32 -> f16, round-to-nearest; the enclosing definition is strictfp too, as functions using constrained FP intrinsics should be
+  ret half %cvt
+}
+
+define float @strict_convert_unsigned(i64 %x) #0 {
+; CHECK-LABEL: strict_convert_unsigned:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: strict_convert_unsigned:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ucvtf s0, x0
+; NONEON-NOSVE-NEXT:    ret
+entry:
+  %cvt = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 ; strict unsigned i64 -> f32, round-to-nearest; the enclosing definition is strictfp too, as functions using constrained FP intrinsics should be
+  ret float %cvt
+}
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 11fee267660c03..5e162fbfef196b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -418,8 +418,10 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-LABEL: fcvtzu_v1f16_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64:
@@ -441,10 +443,9 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.h
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -472,20 +473,17 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.h, z0.h[3]
 ; CHECK-NEXT:    mov z2.h, z0.h[2]
 ; CHECK-NEXT:    mov z3.h, z0.h[1]
-; CHECK-NEXT:    fcvtzu x10, h0
-; CHECK-NEXT:    fcvtzu x8, h1
-; CHECK-NEXT:    fcvtzu x9, h2
-; CHECK-NEXT:    fcvtzu x11, h3
-; CHECK-NEXT:    fmov d2, x10
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    fmov d1, x11
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.h
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.h
+; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.h
 ; CHECK-NEXT:    zip1 z1.d, z2.d, z1.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
@@ -522,36 +520,29 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v8f16_v8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    mov z3.h, z0.h[2]
 ; CHECK-NEXT:    mov z4.h, z0.h[1]
-; CHECK-NEXT:    fcvtzu x10, h0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    fcvtzu x8, h2
-; CHECK-NEXT:    fcvtzu x9, h3
-; CHECK-NEXT:    fcvtzu x11, h4
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.h
+; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.h
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzu z4.d, p0/m, z4.h
 ; CHECK-NEXT:    mov z5.h, z1.h[3]
 ; CHECK-NEXT:    mov z6.h, z1.h[2]
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    fcvtzu x14, h1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d3, x11
-; CHECK-NEXT:    fcvtzu x12, h5
-; CHECK-NEXT:    fcvtzu x13, h6
-; CHECK-NEXT:    fcvtzu x15, h2
-; CHECK-NEXT:    fmov d2, x10
-; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    fmov d1, x12
-; CHECK-NEXT:    fmov d4, x13
-; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
-; CHECK-NEXT:    fmov d3, x14
-; CHECK-NEXT:    zip1 z1.d, z4.d, z1.d
-; CHECK-NEXT:    fmov d4, x15
-; CHECK-NEXT:    stp q2, q0, [x1]
-; CHECK-NEXT:    zip1 z3.d, z3.d, z4.d
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
+; CHECK-NEXT:    mov z7.h, z1.h[1]
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.h
+; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z4.d
+; CHECK-NEXT:    fcvtzu z5.d, p0/m, z5.h
+; CHECK-NEXT:    fcvtzu z6.d, p0/m, z6.h
+; CHECK-NEXT:    fcvtzu z7.d, p0/m, z7.h
+; CHECK-NEXT:    stp q0, q2, [x1]
+; CHECK-NEXT:    zip1 z3.d, z6.d, z5.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z7.d
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
@@ -604,67 +595,54 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    mov z4.h, z1.h[1]
-; CHECK-NEXT:    mov z6.h, z1.h[3]
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    mov z7.h, z0.h[1]
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    ext z5.b, z5.b, z1.b, #8
-; CHECK-NEXT:    fcvtzu x10, h2
-; CHECK-NEXT:    fcvtzu x11, h4
-; CHECK-NEXT:    fcvtzu x12, h6
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    fmov d16, x9
-; CHECK-NEXT:    mov z2.h, z3.h[3]
-; CHECK-NEXT:    mov z4.h, z5.h[3]
-; CHECK-NEXT:    fcvtzu x14, h3
-; CHECK-NEXT:    fcvtzu x13, h1
-; CHECK-NEXT:    fcvtzu x15, h5
-; CHECK-NEXT:    mov z1.h, z3.h[1]
-; CHECK-NEXT:    mov z6.h, z5.h[1]
-; CHECK-NEXT:    mov z5.h, z5.h[2]
-; CHECK-NEXT:    mov z3.h, z3.h[2]
-; CHECK-NEXT:    fcvtzu x9, h2
-; CHECK-NEXT:    fmov d2, x10
-; CHECK-NEXT:    fcvtzu x10, h4
-; CHECK-NEXT:    fmov d4, x11
-; CHECK-NEXT:    fcvtzu x11, h7
-; CHECK-NEXT:    fmov d7, x12
-; CHECK-NEXT:    fcvtzu x12, h0
-; CHECK-NEXT:    fmov d0, x13
-; CHECK-NEXT:    fcvtzu x13, h1
-; CHECK-NEXT:    fmov d1, x14
-; CHECK-NEXT:    fcvtzu x14, h6
-; CHECK-NEXT:    fmov d6, x15
-; CHECK-NEXT:    fcvtzu x15, h5
-; CHECK-NEXT:    fmov d5, x9
-; CHECK-NEXT:    fcvtzu x9, h3
-; CHECK-NEXT:    zip1 z4.d, z16.d, z4.d
-; CHECK-NEXT:    fmov d16, x8
-; CHECK-NEXT:    zip1 z0.d, z0.d, z7.d
-; CHECK-NEXT:    fmov d3, x12
-; CHECK-NEXT:    fmov d7, x10
-; CHECK-NEXT:    stp q4, q0, [x1, #64]
-; CHECK-NEXT:    fmov d0, x14
-; CHECK-NEXT:    fmov d4, x9
-; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
-; CHECK-NEXT:    fmov d3, x11
-; CHECK-NEXT:    zip1 z0.d, z6.d, z0.d
-; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
-; CHECK-NEXT:    zip1 z3.d, z16.d, z3.d
-; CHECK-NEXT:    fmov d16, x15
-; CHECK-NEXT:    stp q3, q2, [x1]
-; CHECK-NEXT:    fmov d2, x13
-; CHECK-NEXT:    zip1 z7.d, z16.d, z7.d
-; CHECK-NEXT:    zip1 z1.d, z1.d, z2.d
-; CHECK-NEXT:    stp q0, q7, [x1, #96]
-; CHECK-NEXT:    stp q1, q4, [x1, #32]
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z3.h, z1.h[1]
+; CHECK-NEXT:    mov z5.h, z0.h[3]
+; CHECK-NEXT:    mov z6.h, z0.h[2]
+; CHECK-NEXT:    mov z16.d, z0.d
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z1.h
+; CHECK-NEXT:    mov z4.h, z1.h[3]
+; CHECK-NEXT:    mov z7.h, z1.h[2]
+; CHECK-NEXT:    mov z17.h, z0.h[1]
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.h
+; CHECK-NEXT:    fcvtzu z5.d, p0/m, z5.h
+; CHECK-NEXT:    fcvtzu z6.d, p0/m, z6.h
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzu z4.d, p0/m, z4.h
+; CHECK-NEXT:    fcvtzu z17.d, p0/m, z17.h
+; CHECK-NEXT:    fcvtzu z7.d, p0/m, z7.h
+; CHECK-NEXT:    mov z20.h, z1.h[3]
+; CHECK-NEXT:    mov z18.h, z16.h[3]
+; CHECK-NEXT:    mov z19.h, z16.h[2]
+; CHECK-NEXT:    mov z21.h, z16.h[1]
+; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    mov z3.h, z1.h[2]
+; CHECK-NEXT:    zip1 z5.d, z6.d, z5.d
+; CHECK-NEXT:    mov z6.h, z1.h[1]
+; CHECK-NEXT:    zip1 z0.d, z0.d, z17.d
+; CHECK-NEXT:    fcvtzu z16.d, p0/m, z16.h
+; CHECK-NEXT:    fcvtzu z18.d, p0/m, z18.h
+; CHECK-NEXT:    movprfx z17, z21
+; CHECK-NEXT:    fcvtzu z17.d, p0/m, z21.h
+; CHECK-NEXT:    fcvtzu z19.d, p0/m, z19.h
+; CHECK-NEXT:    zip1 z4.d, z7.d, z4.d
+; CHECK-NEXT:    movprfx z7, z20
+; CHECK-NEXT:    fcvtzu z7.d, p0/m, z20.h
+; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.h
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.h
+; CHECK-NEXT:    stp q0, q5, [x1, #64]
+; CHECK-NEXT:    fcvtzu z6.d, p0/m, z6.h
+; CHECK-NEXT:    zip1 z0.d, z19.d, z18.d
+; CHECK-NEXT:    zip1 z5.d, z16.d, z17.d
+; CHECK-NEXT:    stp q2, q4, [x1]
+; CHECK-NEXT:    zip1 z2.d, z3.d, z7.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z6.d
+; CHECK-NEXT:    stp q5, q0, [x1, #96]
+; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
@@ -1186,7 +1164,10 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-LABEL: fcvtzu_v1f64_v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -2135,8 +2116,10 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-LABEL: fcvtzs_v1f16_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64:
@@ -2159,10 +2142,9 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.h
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -2190,20 +2172,17 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.h, z0.h[3]
 ; CHECK-NEXT:    mov z2.h, z0.h[2]
 ; CHECK-NEXT:    mov z3.h, z0.h[1]
-; CHECK-NEXT:    fcvtzs x10, h0
-; CHECK-NEXT:    fcvtzs x8, h1
-; CHECK-NEXT:    fcvtzs x9, h2
-; CHECK-NEXT:    fcvtzs x11, h3
-; CHECK-NEXT:    fmov d2, x10
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    fmov d1, x11
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.h
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.h
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.h
 ; CHECK-NEXT:    zip1 z1.d, z2.d, z1.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
@@ -2240,36 +2219,29 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v8f16_v8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    mov z3.h, z0.h[2]
 ; CHECK-NEXT:    mov z4.h, z0.h[1]
-; CHECK-NEXT:    fcvtzs x10, h0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    fcvtzs x8, h2
-; CHECK-NEXT:    fcvtzs x9, h3
-; CHECK-NEXT:    fcvtzs x11, h4
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.h
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.h
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.h
 ; CHECK-NEXT:    mov z5.h, z1.h[3]
 ; CHECK-NEXT:    mov z6.h, z1.h[2]
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    fcvtzs x14, h1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d3, x11
-; CHECK-NEXT:    fcvtzs x12, h5
-; CHECK-NEXT:    fcvtzs x13, h6
-; CHECK-NEXT:    fcvtzs x15, h2
-; CHECK-NEXT:    fmov d2, x10
-; CHECK-NEXT:    zip1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    fmov d1, x12
-; CHECK-NEXT:    fmov d4, x13
-; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
-; CHECK-NEXT:    fmov d3, x14
-; CHECK-NEXT:    zip1 z1.d, z4.d, z1.d
-; CHECK-NEXT:    fmov d4, x15
-; CHECK-NEXT:    stp q2, q0, [x1]
-; CHECK-NEXT:    zip1 z3.d, z3.d, z4.d
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
+; CHECK-NEXT:    mov z7.h, z1.h[1]
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.h
+; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z4.d
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.h
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.h
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.h
+; CHECK-NEXT:    stp q0, q2, [x1]
+; CHECK-NEXT:    zip1 z3.d, z6.d, z5.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z7.d
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
@@ -2322,67 +2294,54 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    mov z4.h, z1.h[1]
-; CHECK-NEXT:    mov z6.h, z1.h[3]
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    mov z7.h, z0.h[1]
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    ext z5.b, z5.b, z1.b, #8
-; CHECK-NEXT:    fcvtzs x10, h2
-; CHECK-NEXT:    fcvtzs x11, h4
-; CHECK-NEXT:    fcvtzs x12, h6
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    fmov d16, x9
-; CHECK-NEXT:    mov z2.h, z3.h[3]
-; CHECK-NEXT:    mov z4.h, z5.h[3]
-; CHECK-NEXT:    fcvtzs x14, h3
-; CHECK-NEXT:    fcvtzs x13, h1
-; CHECK-NEXT:    fcvtzs x15, h5
-; CHECK-NEXT:    mov z1.h, z3.h[1]
-; CHECK-NEXT:    mov z6.h, z5.h[1]
-; CHECK-NEXT:    mov z5.h, z5.h[2]
-; CHECK-NEXT:    mov z3.h, z3.h[2]
-; CHECK-NEXT:    fcvtzs x9, h2
-; CHECK-NEXT:    fmov d2, x10
-; CHECK-NEXT:    fcvtzs x10, h4
-; CHECK-NEXT:    fmov d4, x11
-; CHECK-NEXT:    fcvtzs x11, h7
-; CHECK-NEXT:    fmov d7, x12
-; CHECK-NEXT:    fcvtzs x12, h0
-; CHECK-NEXT:    fmov d0, x13
-; CHECK-NEXT:    fcvtzs x13, h1
-; CHECK-NEXT:    fmov d1, x14
-; CHECK-NEXT:    fcvtzs x14, h6
-; CHECK-NEXT:    fmov d6, x15
-; CHECK-NEXT:    fcvtzs x15, h5
-; CHECK-NEXT:    fmov d5, x9
-; CHECK-NEXT:    fcvtzs x9, h3
-; CHECK-NEXT:    zip1 z4.d, z16.d, z4.d
-; CHECK-NEXT:    fmov d16, x8
-; CHECK-NEXT:    zip1 z0.d, z0.d, z7.d
-; CHECK-NEXT:    fmov d3, x12
-; CHECK-NEXT:    fmov d7, x10
-; CHECK-NEXT:    stp q4, q0, [x1, #64]
-; CHECK-NEXT:    fmov d0, x14
-; CHECK-NEXT:    fmov d4, x9
-; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
-; CHECK-NEXT:    fmov d3, x11
-; CHECK-NEXT:    zip1 z0.d, z6.d, z0.d
-; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
-; CHECK-NEXT:    zip1 z3.d, z16.d, z3.d
-; CHECK-NEXT:    fmov d16, x15
-; CHECK-NEXT:    stp q3, q2, [x1]
-; CHECK-NEXT:    fmov d2, x13
-; CHECK-NEXT:    zip1 z7.d, z16.d, z7.d
-; CHECK-NEXT:    zip1 z1.d, z1.d, z2.d
-; CHECK-NEXT:    stp q0, q7, [x1, #96]
-; CHECK-NEXT:    stp q1, q4, [x1, #32]
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z3.h, z1.h[1]
+; CHECK-NEXT:    mov z5.h, z0.h[3]
+; CHECK-NEXT:    mov z6.h, z0.h[2]
+; CHECK-NEXT:    mov z16.d, z0.d
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.h
+; CHECK-NEXT:    mov z4.h, z1.h[3]
+; CHECK-NEXT:    mov z7.h, z1.h[2]
+; CHECK-NEXT:    mov z17.h, z0.h[1]
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.h
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.h
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.h
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.h
+; CHECK-NEXT:    fcvtzs z17.d, p0/m, z17.h
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.h
+; CHECK-NEXT:    mov z20.h, z1.h[3]
+; CHECK-NEXT:    mov z18.h, z16.h[3]
+; CHECK-NEXT:    mov z19.h, z16.h[2]
+; CHECK-NEXT:    mov z21.h, z16.h[1]
+; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    mov z3.h, z1.h[2]
+; CHECK-NEXT:    zip1 z5.d, z6.d, z5.d
+; CHECK-NEXT:    mov z6.h, z1.h[1]
+; CHECK-NEXT:    zip1 z0.d, z0.d, z17.d
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z16.h
+; CHECK-NEXT:    fcvtzs z18.d, p0/m, z18.h
+; CHECK-NEXT:    movprfx z17, z21
+; CHECK-NEXT:    fcvtzs z17.d, p0/m, z21.h
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z19.h
+; CHECK-NEXT:    zip1 z4.d, z7.d, z4.d
+; CHECK-NEXT:    movprfx z7, z20
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z20.h
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.h
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.h
+; CHECK-NEXT:    stp q0, q5, [x1, #64]
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.h
+; CHECK-NEXT:    zip1 z0.d, z19.d, z18.d
+; CHECK-NEXT:    zip1 z5.d, z16.d, z17.d
+; CHECK-NEXT:    stp q2, q4, [x1]
+; CHECK-NEXT:    zip1 z2.d, z3.d, z7.d
+; CHECK-NEXT:    zip1 z1.d, z1.d, z6.d
+; CHECK-NEXT:    stp q5, q0, [x1, #96]
+; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
@@ -2906,7 +2865,10 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-LABEL: fcvtzs_v1f64_v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index e595686cb4975d..24ad0f502dbf33 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -390,8 +390,11 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    ucvtf d0, w8
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64:
@@ -1142,10 +1145,9 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    ucvtf h0, x8
-; CHECK-NEXT:    ucvtf h1, x9
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    ucvtf z1.h, p0/m, z1.d
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -2596,10 +2598,9 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    scvtf h0, x8
-; CHECK-NEXT:    scvtf h1, x9
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    scvtf z1.h, p0/m, z1.d
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -2795,7 +2796,10 @@ define half @scvtf_i16_f16(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrsh w8, [x0]
-; CHECK-NEXT:    scvtf h0, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i16_f16:
@@ -2813,7 +2817,10 @@ define float @scvtf_i16_f32(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrsh w8, [x0]
-; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i16_f32:
@@ -2830,7 +2837,10 @@ define double @scvtf_i16_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrsh w8, [x0]
-; CHECK-NEXT:    scvtf d0, w8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i16_f64:
@@ -2846,8 +2856,10 @@ define double @scvtf_i16_f64(ptr %0) {
 define half @scvtf_i32_f16(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    scvtf h0, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i32_f16:
@@ -2864,8 +2876,10 @@ define half @scvtf_i32_f16(ptr %0) {
 define float @scvtf_i32_f32(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i32_f32:
@@ -2881,8 +2895,10 @@ define float @scvtf_i32_f32(ptr %0) {
 define double @scvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    scvtf d0, w8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i32_f64:
@@ -2898,8 +2914,10 @@ define double @scvtf_i32_f64(ptr %0) {
 define half @scvtf_i64_f16(ptr %0) {
 ; CHECK-LABEL: scvtf_i64_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    scvtf h0, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i64_f16:
@@ -2916,8 +2934,10 @@ define half @scvtf_i64_f16(ptr %0) {
 define float @scvtf_i64_f32(ptr %0) {
 ; CHECK-LABEL: scvtf_i64_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i64_f32:
@@ -2933,8 +2953,10 @@ define float @scvtf_i64_f32(ptr %0) {
 define double @scvtf_i64_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i64_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_i64_f64:
@@ -2951,7 +2973,10 @@ define half @ucvtf_i16_f16(ptr %0) {
 ; CHECK-LABEL: ucvtf_i16_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ucvtf h0, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f16:
@@ -2969,7 +2994,10 @@ define float @ucvtf_i16_f32(ptr %0) {
 ; CHECK-LABEL: ucvtf_i16_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f32:
@@ -2986,7 +3014,10 @@ define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i16_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ucvtf d0, w8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f64:
@@ -3002,8 +3033,10 @@ define double @ucvtf_i16_f64(ptr %0) {
 define half @ucvtf_i32_f16(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ucvtf h0, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i32_f16:
@@ -3020,8 +3053,10 @@ define half @ucvtf_i32_f16(ptr %0) {
 define float @ucvtf_i32_f32(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i32_f32:
@@ -3037,8 +3072,10 @@ define float @ucvtf_i32_f32(ptr %0) {
 define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ucvtf d0, w8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i32_f64:
@@ -3054,8 +3091,10 @@ define double @ucvtf_i32_f64(ptr %0) {
 define half @ucvtf_i64_f16(ptr %0) {
 ; CHECK-LABEL: ucvtf_i64_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ucvtf h0, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i64_f16:
@@ -3072,8 +3111,10 @@ define half @ucvtf_i64_f16(ptr %0) {
 define float @ucvtf_i64_f32(ptr %0) {
 ; CHECK-LABEL: ucvtf_i64_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ucvtf s0, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i64_f32:
@@ -3089,8 +3130,10 @@ define float @ucvtf_i64_f32(ptr %0) {
 define double @ucvtf_i64_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i64_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ucvtf d0, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i64_f64:

>From f59876f5258b3b3bbcddb7d9477e325cc5408151 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 23 Oct 2024 09:06:18 +0000
Subject: [PATCH 4/8] Remove strict converts

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 37 ++++---------------
 .../sve-streaming-mode-cvt-fp-to-int.ll       | 10 +----
 .../sve-streaming-mode-cvt-int-to-fp.ll       | 12 ++----
 3 files changed, 13 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ab329a6dc79080..7e8153534076d3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1454,12 +1454,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::UINT_TO_FP, VT, Custom);
       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
-      setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Custom);
-      setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Custom);
       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
-      setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
-      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::MULHS, VT, Custom);
@@ -2142,8 +2138,6 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::FP_ROUND, VT, Default);
   setOperationAction(ISD::FP_TO_SINT, VT, Default);
   setOperationAction(ISD::FP_TO_UINT, VT, Default);
-  setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Default);
-  setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Default);
   setOperationAction(ISD::FRINT, VT, Default);
   setOperationAction(ISD::LRINT, VT, Default);
   setOperationAction(ISD::LLRINT, VT, Default);
@@ -2170,7 +2164,6 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::SIGN_EXTEND, VT, Default);
   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Default);
   setOperationAction(ISD::SINT_TO_FP, VT, Default);
-  setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Default);
   setOperationAction(ISD::SMAX, VT, Default);
   setOperationAction(ISD::SMIN, VT, Default);
   setOperationAction(ISD::SPLAT_VECTOR, VT, Default);
@@ -2181,7 +2174,6 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::TRUNCATE, VT, Default);
   setOperationAction(ISD::UDIV, VT, Default);
   setOperationAction(ISD::UINT_TO_FP, VT, Default);
-  setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Default);
   setOperationAction(ISD::UMAX, VT, Default);
   setOperationAction(ISD::UMIN, VT, Default);
   setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
@@ -4649,8 +4641,8 @@ static bool CanLowerToScalarSVEFPIntConversion(EVT VT) {
 
 /// Lowers a scalar FP conversion (to/from) int to SVE.
 static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) {
-  bool IsStrict = Op->isStrictFPOpcode();
-  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+  assert(!Op->isStrictFPOpcode() && "strict fp ops not supported");
+  SDValue SrcVal = Op.getOperand(0);
   EVT SrcTy = SrcVal.getValueType();
   EVT DestTy = Op.getValueType();
   EVT SrcVecTy;
@@ -4672,14 +4664,9 @@ static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) {
   SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
   SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
                             DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
-  Vec = IsStrict ? DAG.getNode(Op.getOpcode(), dl, {DestVecTy, MVT::Other},
-                               {Op.getOperand(0), Vec})
-                 : DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec);
-  SDValue Scalar =
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, ZeroIdx);
-  if (IsStrict)
-    return DAG.getMergeValues({Scalar, Vec.getValue(1)}, dl);
-  return Scalar;
+  Vec = DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+                     ZeroIdx);
 }
 
 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
@@ -4690,7 +4677,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
   if (SrcVal.getValueType().isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
-  if (!Subtarget->isNeonAvailable() &&
+  if (!IsStrict && !Subtarget->isNeonAvailable() &&
       Subtarget->isSVEorStreamingSVEAvailable() &&
       CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
       CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
@@ -4999,7 +4986,7 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
   bool IsStrict = Op->isStrictFPOpcode();
   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
 
-  if (!Subtarget->isNeonAvailable() &&
+  if (!IsStrict && !Subtarget->isNeonAvailable() &&
       Subtarget->isSVEorStreamingSVEAvailable() &&
       CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
       CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
@@ -28443,12 +28430,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
   assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
 
   SmallVector<SDValue, 4> Operands = {Pg};
-  SDValue Chain{};
   for (const SDValue &V : Op->op_values()) {
-    if (!isa<CondCodeSDNode>(V) && V.getValueType() == MVT::Other) {
-      Chain = V;
-      continue;
-    }
     assert((!V.getValueType().isVector() ||
             V.getValueType().isScalableVector()) &&
            "Only scalable vectors are supported!");
@@ -28458,10 +28440,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
   if (isMergePassthruOpcode(NewOp))
     Operands.push_back(DAG.getUNDEF(VT));
 
-  auto NewNode = DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
-  if (Chain)
-    return DAG.getMergeValues({NewNode, Chain}, DL);
-  return NewNode;
+  return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
 }
 
 // If a fixed length vector operation has no side effects when applied to
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
index 60d3124f5b21e8..300ccefc71c91e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
@@ -228,10 +228,7 @@ define i64 @f64_to_u64(double %x) {
 define i32 @strict_convert_signed(double %x) {
 ; CHECK-LABEL: strict_convert_signed:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    fcvtzs w0, d0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: strict_convert_signed:
@@ -246,10 +243,7 @@ define i32 @strict_convert_signed(double %x) {
 define i32 @strict_convert_unsigned(float %x) {
 ; CHECK-LABEL: strict_convert_unsigned:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
-; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    fcvtzu w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: strict_convert_unsigned:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
index 42be60ad559705..0fc0d9cda4e637 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sve,+sme -force-streaming < %s | FileCheck %s
 ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -228,10 +228,7 @@ entry:
 define half @strict_convert_signed(i32 %x) {
 ; CHECK-LABEL: strict_convert_signed:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    scvtf h0, w0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: strict_convert_signed:
@@ -247,10 +244,7 @@ entry:
 define float @strict_convert_unsigned(i64 %x) {
 ; CHECK-LABEL: strict_convert_unsigned:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ucvtf s0, x0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: strict_convert_unsigned:

>From 3a5683a9d37f3e8570a8b1481c907e65393be878 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 24 Oct 2024 11:14:40 +0000
Subject: [PATCH 5/8] Move to DAGCombine + fixups

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 116 +++++++++---------
 .../sve-streaming-mode-cvt-fp-int-fp.ll       |  48 +++++++-
 .../sve-streaming-mode-cvt-int-to-fp.ll       |  11 +-
 3 files changed, 111 insertions(+), 64 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7e8153534076d3..1aaf8a79218d4c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4550,10 +4550,9 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   EVT VT = Op.getValueType();
 
   if (VT.isScalableVector()) {
-    unsigned Opc = Op.getOpcode();
-    bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
-    unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
-                               : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
+    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
+                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
+                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
     return LowerToPredicatedOp(Op, DAG, Opcode);
   }
 
@@ -4629,46 +4628,6 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   return Op;
 }
 
-static bool CanLowerToScalarSVEFPIntConversion(EVT VT) {
-  if (!VT.isSimple())
-    return false;
-  // There are SVE instructions that can convert to/from all pairs of these int
-  // and float types. Note: We don't bother with i8 or i16 as those are illegal
-  // types for scalars.
-  return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
-                      VT.getSimpleVT().SimpleTy);
-}
-
-/// Lowers a scalar FP conversion (to/from) int to SVE.
-static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) {
-  assert(!Op->isStrictFPOpcode() && "strict fp ops not supported");
-  SDValue SrcVal = Op.getOperand(0);
-  EVT SrcTy = SrcVal.getValueType();
-  EVT DestTy = Op.getValueType();
-  EVT SrcVecTy;
-  EVT DestVecTy;
-  // Use a packed vector for the larger type.
-  // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that
-  // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as
-  // (unlike floats) nxv2i32 is an illegal unpacked type.
-  if (DestTy.bitsGT(SrcTy)) {
-    DestVecTy = getPackedSVEVectorVT(DestTy);
-    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
-                                 : DestVecTy.changeVectorElementType(SrcTy);
-  } else {
-    SrcVecTy = getPackedSVEVectorVT(SrcTy);
-    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
-                                   : SrcVecTy.changeVectorElementType(DestTy);
-  }
-  SDLoc dl(Op);
-  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
-  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
-                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
-  Vec = DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
-                     ZeroIdx);
-}
-
 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
                                               SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
@@ -4677,12 +4636,6 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
   if (SrcVal.getValueType().isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
-  if (!IsStrict && !Subtarget->isNeonAvailable() &&
-      Subtarget->isSVEorStreamingSVEAvailable() &&
-      CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
-      CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
-    return LowerScalarFPConversionToSVE(Op, DAG);
-
   // f16 conversions are promoted to f32 when full fp16 is not supported.
   if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
       SrcVal.getValueType() == MVT::bf16) {
@@ -4986,12 +4939,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
   bool IsStrict = Op->isStrictFPOpcode();
   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
 
-  if (!IsStrict && !Subtarget->isNeonAvailable() &&
-      Subtarget->isSVEorStreamingSVEAvailable() &&
-      CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
-      CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
-    return LowerScalarFPConversionToSVE(Op, DAG);
-
   bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
                   Op->getOpcode() == ISD::SINT_TO_FP;
 
@@ -19014,6 +18961,57 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
+static bool
+shouldUseSVEForScalarFPConversion(SDNode *N,
+                                  const AArch64Subtarget *Subtarget) {
+  auto isSupportedType = [](EVT VT) {
+    if (!VT.isSimple())
+      return false;
+    // There are SVE instructions that can convert to/from all pairs of these
+    // int and float types. Note: We don't bother with i8 or i16 as those are
+    // illegal types for scalars.
+    return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
+                        VT.getSimpleVT().SimpleTy);
+  };
+  // If we are in a streaming[-compatible] function, use SVE for scalar FP <->
+  // INT conversions as this can help avoid movs between GPRs and FPRs, which
+  // could be quite expensive.
+  return !N->isStrictFPOpcode() && Subtarget->isSVEorStreamingSVEAvailable() &&
+         (Subtarget->isStreaming() || Subtarget->isStreamingCompatible()) &&
+         isSupportedType(N->getValueType(0)) &&
+         isSupportedType(N->getOperand(0).getValueType());
+}
+
+/// Replaces a scalar FP <-> INT conversion with an SVE (scalable) one, wrapped
+/// with an insert and extract.
+static SDValue replaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG) {
+  assert(!N->isStrictFPOpcode() && "strict fp ops not supported");
+  SDValue SrcVal = N->getOperand(0);
+  EVT SrcTy = SrcVal.getValueType();
+  EVT DestTy = N->getValueType(0);
+  EVT SrcVecTy;
+  EVT DestVecTy;
+  // Use a packed vector for the larger type.
+  // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that
+  // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as
+  // (unlike floats) nxv2i32 is an illegal unpacked type.
+  if (DestTy.bitsGT(SrcTy)) {
+    DestVecTy = getPackedSVEVectorVT(DestTy);
+    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
+                                 : DestVecTy.changeVectorElementType(SrcTy);
+  } else {
+    SrcVecTy = getPackedSVEVectorVT(SrcTy);
+    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
+                                   : SrcVecTy.changeVectorElementType(DestTy);
+  }
+  SDLoc dl(N);
+  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
+  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
+                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
+  Vec = DAG.getNode(N->getOpcode(), dl, DestVecTy, Vec);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DestTy, Vec, ZeroIdx);
+}
+
 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                      const AArch64Subtarget *Subtarget) {
   // First try to optimize away the conversion when it's conditionally from
@@ -19021,6 +19019,9 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
     return Res;
 
+  if (shouldUseSVEForScalarFPConversion(N, Subtarget))
+    return replaceScalarFPConversionWithSVE(N, DAG);
+
   EVT VT = N->getValueType(0);
   if (VT != MVT::f32 && VT != MVT::f64)
     return SDValue();
@@ -19059,6 +19060,9 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
+  if (shouldUseSVEForScalarFPConversion(N, Subtarget))
+    return replaceScalarFPConversionWithSVE(N, DAG);
+
   if (!Subtarget->isNeonAvailable())
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index 0f4cb2060f2498..1050dc0210a67e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming-compatible  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible -mattr=+sme2p2  < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
-; RUN: llc < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme2p2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
+; RUN: llc -mattr=+neon < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -20,6 +22,12 @@ define double @t1(double %x) {
 ; USE-NEON-NO-GPRS-NEXT:    fcvtzs d0, d0
 ; USE-NEON-NO-GPRS-NEXT:    scvtf d0, d0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t1:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptosi double %x to i64
   %conv1 = sitofp i64 %conv to double
@@ -41,6 +49,12 @@ define float @t2(float %x) {
 ; USE-NEON-NO-GPRS-NEXT:    fcvtzs s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    scvtf s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t2:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptosi float %x to i32
   %conv1 = sitofp i32 %conv to float
@@ -64,6 +78,14 @@ define half @t3(half %x)  {
 ; USE-NEON-NO-GPRS-NEXT:    scvtf s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    fcvt h0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t3:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptosi half %x to i32
   %conv1 = sitofp i32 %conv to half
@@ -85,6 +107,12 @@ define double @t4(double %x) {
 ; USE-NEON-NO-GPRS-NEXT:    fcvtzu d0, d0
 ; USE-NEON-NO-GPRS-NEXT:    ucvtf d0, d0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t4:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptoui double %x to i64
   %conv1 = uitofp i64 %conv to double
@@ -106,6 +134,12 @@ define float @t5(float %x) {
 ; USE-NEON-NO-GPRS-NEXT:    fcvtzu s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ucvtf s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t5:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptoui float %x to i32
   %conv1 = uitofp i32 %conv to float
@@ -129,6 +163,14 @@ define half @t6(half %x)  {
 ; USE-NEON-NO-GPRS-NEXT:    ucvtf s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    fcvt h0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t6:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptoui half %x to i32
   %conv1 = uitofp i32 %conv to half
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
index 0fc0d9cda4e637..61049478850c00 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
-; RUN: llc -mattr=+sve,+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,FORCE-STREAMING
 ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -226,10 +226,11 @@ entry:
 }
 
 define half @strict_convert_signed(i32 %x) {
-; CHECK-LABEL: strict_convert_signed:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf h0, w0
-; CHECK-NEXT:    ret
+; FORCE-STREAMING-LABEL: strict_convert_signed:
+; FORCE-STREAMING:       // %bb.0: // %entry
+; FORCE-STREAMING-NEXT:    scvtf s0, w0
+; FORCE-STREAMING-NEXT:    fcvt h0, s0
+; FORCE-STREAMING-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: strict_convert_signed:
 ; NONEON-NOSVE:       // %bb.0: // %entry

>From 8a37e17d6a0f86079c521f344d6cf392cdf8993c Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 24 Oct 2024 11:25:38 +0000
Subject: [PATCH 6/8] Tweak test

---
 .../sve-streaming-mode-cvt-int-to-fp.ll        | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
index 61049478850c00..0a00ce69587b0d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,FORCE-STREAMING
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK
 ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -225,21 +225,19 @@ entry:
   ret double %cvt
 }
 
-define half @strict_convert_signed(i32 %x) {
-; FORCE-STREAMING-LABEL: strict_convert_signed:
-; FORCE-STREAMING:       // %bb.0: // %entry
-; FORCE-STREAMING-NEXT:    scvtf s0, w0
-; FORCE-STREAMING-NEXT:    fcvt h0, s0
-; FORCE-STREAMING-NEXT:    ret
+define float @strict_convert_signed(i32 %x) {
+; CHECK-LABEL: strict_convert_signed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    scvtf s0, w0
+; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: strict_convert_signed:
 ; NONEON-NOSVE:       // %bb.0: // %entry
 ; NONEON-NOSVE-NEXT:    scvtf s0, w0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    ret
 entry:
-  %cvt = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
-  ret half %cvt
+  %cvt = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %cvt
 }
 
 define float @strict_convert_unsigned(i64 %x) {

>From 64335db5b4c70dfa7639f0626a167d506486d2c2 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 24 Oct 2024 15:14:37 +0000
Subject: [PATCH 7/8] WIP

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 120 ++++++++++--------
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |   8 +-
 .../sve-streaming-mode-cvt-fp-to-int.ll       |  10 +-
 .../sve-streaming-mode-cvt-int-to-fp.ll       |  11 +-
 ...e-streaming-mode-fixed-length-fp-to-int.ll |   4 +-
 ...e-streaming-mode-fixed-length-int-to-fp.ll |  24 ++--
 6 files changed, 99 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1aaf8a79218d4c..f29605f62ee6f8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18961,9 +18961,39 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
-static bool
-shouldUseSVEForScalarFPConversion(SDNode *N,
-                                  const AArch64Subtarget *Subtarget) {
+/// Creates a scalar FP <-> INT conversion with a scalable one, wrapped
+/// with an insert and extract.
+static SDValue createScalarSVEFPConversion(SelectionDAG &DAG, unsigned Opc,
+                                           SDLoc DL, SDValue SrcVal, EVT SrcTy,
+                                           EVT DestTy) {
+  EVT SrcVecTy;
+  EVT DestVecTy;
+  if (DestTy.bitsGT(SrcTy)) {
+    DestVecTy = getPackedSVEVectorVT(DestTy);
+    SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
+  } else {
+    SrcVecTy = getPackedSVEVectorVT(SrcTy);
+    DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
+  }
+  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
+                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
+  Vec = DAG.getNode(Opc, DL, DestVecTy, Vec);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Vec, ZeroIdx);
+}
+
+/// Tries to replace scalar FP <-> conversions with SVE in streaming functions.
+static SDValue
+tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const AArch64Subtarget *Subtarget) {
+  // Uncomment to introduce extra fcvts.
+  // if (DCI.isBeforeLegalizeOps())
+  //   return SDValue();
+
+  if (N->isStrictFPOpcode())
+    return SDValue();
+
   auto isSupportedType = [](EVT VT) {
     if (!VT.isSimple())
       return false;
@@ -18973,54 +19003,52 @@ shouldUseSVEForScalarFPConversion(SDNode *N,
     return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
                         VT.getSimpleVT().SimpleTy);
   };
+
+  if (!isSupportedType(N->getValueType(0)) ||
+      !isSupportedType(N->getOperand(0).getValueType()))
+    return SDValue();
+
   // If we are in a streaming[-compatible] function, use SVE for scalar FP <->
-  // INT conversions as this can help avoid movs between GPRs and FPRs, which
+  // INT conversions as this can help avoid moves between GPRs and FPRs, which
   // could be quite expensive.
-  return !N->isStrictFPOpcode() && Subtarget->isSVEorStreamingSVEAvailable() &&
-         (Subtarget->isStreaming() || Subtarget->isStreamingCompatible()) &&
-         isSupportedType(N->getValueType(0)) &&
-         isSupportedType(N->getOperand(0).getValueType());
-}
+  if (!Subtarget->isSVEorStreamingSVEAvailable() ||
+      (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
+    return SDValue();
 
-/// Replaces a scalar FP <-> INT conversion with an SVE (scalable) one, wrapped
-/// with an insert and extract.
-static SDValue replaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG) {
-  assert(!N->isStrictFPOpcode() && "strict fp ops not supported");
+  SDLoc DL(N);
+  unsigned Opc = N->getOpcode();
   SDValue SrcVal = N->getOperand(0);
   EVT SrcTy = SrcVal.getValueType();
   EVT DestTy = N->getValueType(0);
-  EVT SrcVecTy;
-  EVT DestVecTy;
-  // Use a packed vector for the larger type.
-  // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that
-  // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as
-  // (unlike floats) nxv2i32 is an illegal unpacked type.
-  if (DestTy.bitsGT(SrcTy)) {
-    DestVecTy = getPackedSVEVectorVT(DestTy);
-    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
-                                 : DestVecTy.changeVectorElementType(SrcTy);
-  } else {
-    SrcVecTy = getPackedSVEVectorVT(SrcTy);
-    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
-                                   : SrcVecTy.changeVectorElementType(DestTy);
+
+  // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal
+  // type (unlike the equivalent nxv2f32 for floating-point types).
+  // May materialize extra instructions :(
+  if (SrcTy == MVT::i32 && DestTy == MVT::f64) {
+    SDValue ExtSrc = DAG.getNode(Opc == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND
+                                                        : ISD::ZERO_EXTEND,
+                                 DL, MVT::i64, SrcVal);
+    return createScalarSVEFPConversion(DAG, Opc, DL, ExtSrc, MVT::i64,
+                                       MVT::f64);
   }
-  SDLoc dl(N);
-  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
-  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
-                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
-  Vec = DAG.getNode(N->getOpcode(), dl, DestVecTy, Vec);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DestTy, Vec, ZeroIdx);
+  if (SrcTy == MVT::f64 && DestTy == MVT::i32) {
+    SDValue ExtDest =
+        createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, MVT::f64, MVT::i64);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, ExtDest);
+  }
+  return createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, SrcTy, DestTy);
 }
 
 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
   // First try to optimize away the conversion when it's conditionally from
   // a constant. Vectors only.
   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
     return Res;
 
-  if (shouldUseSVEForScalarFPConversion(N, Subtarget))
-    return replaceScalarFPConversionWithSVE(N, DAG);
+  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
+    return Res;
 
   EVT VT = N->getValueType(0);
   if (VT != MVT::f32 && VT != MVT::f64)
@@ -19060,8 +19088,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
-  if (shouldUseSVEForScalarFPConversion(N, Subtarget))
-    return replaceScalarFPConversionWithSVE(N, DAG);
+  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
+    return Res;
 
   if (!Subtarget->isNeonAvailable())
     return SDValue();
@@ -26082,7 +26110,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performMulCombine(N, DAG, DCI, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
-    return performIntToFpCombine(N, DAG, Subtarget);
+    return performIntToFpCombine(N, DAG, DCI, Subtarget);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::FP_TO_SINT_SAT:
@@ -28384,21 +28412,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                    unsigned NewOp) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
-  SDValue Pg;
-
-  // FCVTZS_ZPmZ_DtoS and FCVTZU_ZPmZ_DtoS are special cases. These operations
-  // return nxv4i32 rather than the correct nxv2i32, as nxv2i32 is an illegal
-  // unpacked type. So, in this case, we take the predicate size from the
-  // operand.
-  SDValue LastOp{};
-  if ((NewOp == AArch64ISD::FCVTZU_MERGE_PASSTHRU ||
-       NewOp == AArch64ISD::FCVTZS_MERGE_PASSTHRU) &&
-      VT == MVT::nxv4i32 &&
-      (LastOp = Op->ops().back().get()).getValueType() == MVT::nxv2f64) {
-    Pg = getPredicateForVector(DAG, DL, LastOp.getValueType());
-  } else {
-    Pg = getPredicateForVector(DAG, DL, VT);
-  }
+  auto Pg = getPredicateForVector(DAG, DL, VT);
 
   if (VT.isFixedLengthVector()) {
     assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index dfff9c627540be..78249052449059 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2328,8 +2328,8 @@ let Predicates = [HasSVEorSME] in {
   defm FCVT_ZPmZ_HtoD   : sve_fp_2op_p_zd< 0b1101001, "fcvt",   ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16,   AArch64fcvte_mt,  nxv2f64, nxv2i1, nxv2f16, ElementSizeD>;
   defm FCVT_ZPmZ_DtoS   : sve_fp_2op_p_zdr<0b1101010, "fcvt",   ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64,   AArch64fcvtr_mt,  nxv2f32, nxv2i1, nxv2f64, ElementSizeD>;
   defm FCVT_ZPmZ_StoD   : sve_fp_2op_p_zd< 0b1101011, "fcvt",   ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32,   AArch64fcvte_mt,  nxv2f64, nxv2i1, nxv2f32, ElementSizeD>;
-  defm SCVTF_ZPmZ_StoD  : sve_fp_2op_p_zd< 0b1110000, "scvtf",  ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32,  AArch64scvtf_mt,  nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
-  defm UCVTF_ZPmZ_StoD  : sve_fp_2op_p_zd< 0b1110001, "ucvtf",  ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32,  AArch64ucvtf_mt,  nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+  defm SCVTF_ZPmZ_StoD  : sve_fp_2op_p_zd< 0b1110000, "scvtf",  ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32,  null_frag,        nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+  defm UCVTF_ZPmZ_StoD  : sve_fp_2op_p_zd< 0b1110001, "ucvtf",  ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32,  null_frag,        nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
   defm UCVTF_ZPmZ_StoH  : sve_fp_2op_p_zd< 0b0110101, "ucvtf",  ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32,  AArch64ucvtf_mt,  nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
   defm SCVTF_ZPmZ_DtoS  : sve_fp_2op_p_zd< 0b1110100, "scvtf",  ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64,  AArch64scvtf_mt,  nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
   defm SCVTF_ZPmZ_StoH  : sve_fp_2op_p_zd< 0b0110100, "scvtf",  ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32,  AArch64scvtf_mt,  nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
@@ -2338,8 +2338,8 @@ let Predicates = [HasSVEorSME] in {
   defm UCVTF_ZPmZ_DtoH  : sve_fp_2op_p_zd< 0b0110111, "ucvtf",  ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64,  AArch64ucvtf_mt,  nxv2f16, nxv2i1, nxv2i64, ElementSizeD>;
   defm SCVTF_ZPmZ_DtoD  : sve_fp_2op_p_zd< 0b1110110, "scvtf",  ZPR64, ZPR64, null_frag,                     AArch64scvtf_mt,  nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
   defm UCVTF_ZPmZ_DtoD  : sve_fp_2op_p_zd< 0b1110111, "ucvtf",  ZPR64, ZPR64, null_frag,                     AArch64ucvtf_mt,  nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
-  defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, AArch64fcvtzs_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
-  defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, AArch64fcvtzu_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+  defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag,        nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+  defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag,        nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
   defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>;
   defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>;
   defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
index 300ccefc71c91e..6ef5a0b985b59b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
@@ -84,8 +84,9 @@ define i32 @f64_to_s32(double %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: f64_to_s32:
@@ -194,8 +195,9 @@ define i32 @f64_to_u32(double %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.d
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: f64_to_u32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
index 0a00ce69587b0d..59b6a4a69e5d19 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -45,9 +45,11 @@ entry:
 define double @s32_to_f64(i32 %x) {
 ; CHECK-LABEL: s32_to_f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -100,9 +102,10 @@ entry:
 define double @u32_to_f64(i32 %x) {
 ; CHECK-LABEL: u32_to_f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 5e162fbfef196b..4add5d8a23ac9b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -1166,7 +1166,7 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -2867,7 +2867,7 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 24ad0f502dbf33..18d4209bb76e42 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -392,8 +392,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -2836,10 +2836,10 @@ define float @scvtf_i16_f32(ptr %0) {
 define double @scvtf_i16_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsh w8, [x0]
+; CHECK-NEXT:    ldrsh x8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -2895,9 +2895,10 @@ define float @scvtf_i32_f32(ptr %0) {
 define double @scvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -3015,8 +3016,8 @@ define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -3072,9 +3073,10 @@ define float @ucvtf_i32_f32(ptr %0) {
 define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;

>From b81e8db670a3cfdbcb2da5c07849bd8330a4aee5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 24 Oct 2024 16:54:27 +0000
Subject: [PATCH 8/8] Use intrinsics

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 89 +++++++++----------
 .../sve-streaming-mode-cvt-fp-to-int.ll       | 10 +--
 .../sve-streaming-mode-cvt-int-to-fp.ll       | 11 +--
 ...e-streaming-mode-fixed-length-fp-to-int.ll |  4 +-
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 24 +++--
 5 files changed, 65 insertions(+), 73 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f29605f62ee6f8..2b9420dd100036 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18961,36 +18961,10 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
-/// Creates a scalar FP <-> INT conversion with a scalable one, wrapped
-/// with an insert and extract.
-static SDValue createScalarSVEFPConversion(SelectionDAG &DAG, unsigned Opc,
-                                           SDLoc DL, SDValue SrcVal, EVT SrcTy,
-                                           EVT DestTy) {
-  EVT SrcVecTy;
-  EVT DestVecTy;
-  if (DestTy.bitsGT(SrcTy)) {
-    DestVecTy = getPackedSVEVectorVT(DestTy);
-    SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
-  } else {
-    SrcVecTy = getPackedSVEVectorVT(SrcTy);
-    DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
-  }
-  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
-  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
-                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
-  Vec = DAG.getNode(Opc, DL, DestVecTy, Vec);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Vec, ZeroIdx);
-}
-
 /// Tries to replace scalar FP <-> INT conversions with SVE in streaming functions.
 static SDValue
 tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
-                                    TargetLowering::DAGCombinerInfo &DCI,
                                     const AArch64Subtarget *Subtarget) {
-  // Uncomment to introduce extra fcvts.
-  // if (DCI.isBeforeLegalizeOps())
-  //   return SDValue();
-
   if (N->isStrictFPOpcode())
     return SDValue();
 
@@ -19015,39 +18989,64 @@ tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
       (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
     return SDValue();
 
-  SDLoc DL(N);
   unsigned Opc = N->getOpcode();
+  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::FP_TO_SINT;
+
   SDValue SrcVal = N->getOperand(0);
   EVT SrcTy = SrcVal.getValueType();
   EVT DestTy = N->getValueType(0);
 
-  // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal
-  // type (unlike the equivalent nxv2f32 for floating-point types).
-  // May materialize extra instructions :(
-  if (SrcTy == MVT::i32 && DestTy == MVT::f64) {
-    SDValue ExtSrc = DAG.getNode(Opc == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND
-                                                        : ISD::ZERO_EXTEND,
-                                 DL, MVT::i64, SrcVal);
-    return createScalarSVEFPConversion(DAG, Opc, DL, ExtSrc, MVT::i64,
-                                       MVT::f64);
+  EVT SrcVecTy;
+  EVT DestVecTy;
+  if (DestTy.bitsGT(SrcTy)) {
+    DestVecTy = getPackedSVEVectorVT(DestTy);
+    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
+                                 : DestVecTy.changeVectorElementType(SrcTy);
+  } else {
+    SrcVecTy = getPackedSVEVectorVT(SrcTy);
+    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
+                                   : SrcVecTy.changeVectorElementType(DestTy);
   }
-  if (SrcTy == MVT::f64 && DestTy == MVT::i32) {
-    SDValue ExtDest =
-        createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, MVT::f64, MVT::i64);
-    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, ExtDest);
+
+  SDLoc DL(N);
+  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
+                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
+
+  // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal
+  // type (unlike the equivalent nxv2f32 for floating-point types). So,
+  // unfortunately, the only way to lower to these variants is via the
+  // intrinsics. Note: We could sign/zero extend to the i64 variant, but that
+  // may result in extra extends or fmovs in the final assembly.
+  bool IsI32ToF64 = SrcTy == MVT::i32 && DestTy == MVT::f64;
+  bool isF64ToI32 = SrcTy == MVT::f64 && DestTy == MVT::i32;
+  if (IsI32ToF64 || isF64ToI32) {
+    unsigned IntrinsicOpc;
+    if (IsI32ToF64)
+      IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_scvtf_f64i32
+                              : Intrinsic::aarch64_sve_ucvtf_f64i32;
+    else
+      IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_fcvtzs_i32f64
+                              : Intrinsic::aarch64_sve_fcvtzu_i32f64;
+    SDValue PTrue = getPredicateForVector(DAG, DL, MVT::nxv2f64);
+    Vec = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, DestVecTy,
+                      {DAG.getConstant(IntrinsicOpc, DL, MVT::i32),
+                       DAG.getUNDEF(DestTy), PTrue, Vec});
+  } else {
+    Vec = DAG.getNode(Opc, DL, DestVecTy, Vec);
   }
-  return createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, SrcTy, DestTy);
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Vec, ZeroIdx);
 }
 
 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
-                                     TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
   // First try to optimize away the conversion when it's conditionally from
   // a constant. Vectors only.
   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
     return Res;
 
-  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
+  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
     return Res;
 
   EVT VT = N->getValueType(0);
@@ -19088,7 +19087,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
-  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
+  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
     return Res;
 
   if (!Subtarget->isNeonAvailable())
@@ -26110,7 +26109,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performMulCombine(N, DAG, DCI, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
-    return performIntToFpCombine(N, DAG, DCI, Subtarget);
+    return performIntToFpCombine(N, DAG, Subtarget);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::FP_TO_SINT_SAT:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
index 6ef5a0b985b59b..300ccefc71c91e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
@@ -84,9 +84,8 @@ define i32 @f64_to_s32(double %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: f64_to_s32:
@@ -195,9 +194,8 @@ define i32 @f64_to_u32(double %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.d
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: f64_to_u32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
index 59b6a4a69e5d19..0a00ce69587b0d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -45,11 +45,9 @@ entry:
 define double @s32_to_f64(i32 %x) {
 ; CHECK-LABEL: s32_to_f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -102,10 +100,9 @@ entry:
 define double @u32_to_f64(i32 %x) {
 ; CHECK-LABEL: u32_to_f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 4add5d8a23ac9b..5e162fbfef196b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -1166,7 +1166,7 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -2867,7 +2867,7 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 18d4209bb76e42..24ad0f502dbf33 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -392,8 +392,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -2836,10 +2836,10 @@ define float @scvtf_i16_f32(ptr %0) {
 define double @scvtf_i16_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsh x8, [x0]
+; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -2895,10 +2895,9 @@ define float @scvtf_i32_f32(ptr %0) {
 define double @scvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -3016,8 +3015,8 @@ define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -3073,10 +3072,9 @@ define float @ucvtf_i32_f32(ptr %0) {
 define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;



More information about the llvm-commits mailing list