[llvm] 9620ce9 - [RISCV] Support fixed-length vector FP_ROUND & FP_EXTEND

Fraser Cormack via llvm-commits <llvm-commits@lists.llvm.org>
Thu Feb 25 04:22:16 PST 2021


Author: Fraser Cormack
Date: 2021-02-25T12:16:06Z
New Revision: 9620ce90d7238c5ff450a83b49cbc4b811d19830

URL: https://github.com/llvm/llvm-project/commit/9620ce90d7238c5ff450a83b49cbc4b811d19830
DIFF: https://github.com/llvm/llvm-project/commit/9620ce90d7238c5ff450a83b49cbc4b811d19830.diff

LOG: [RISCV] Support fixed-length vector FP_ROUND & FP_EXTEND

This patch extends support for vector FP_ROUND and FP_EXTEND to include
fixed-length vector types. Since fixed-length vectors use "VL" nodes and
scalable vectors can use the standard nodes, there is slightly more to do
in the fixed-length case. A helper function was introduced to try to
reduce the divergent code paths. It is expected that this function will
similarly prove useful when lowering the int-to-fp and fp-to-int
operations for fixed-length vectors.
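
For illustration, a minimal IR sketch (not taken from the added test file,
which covers the 2- and 8-element cases; the function name and element
count here are hypothetical) of the kind of fixed-length conversion this
patch custom-lowers, with the f16->f64 extension going via f32:

  define void @fpext_v4f16_v4f64(<4 x half>* %x, <4 x double>* %y) {
    %a = load <4 x half>, <4 x half>* %x
    ; Lowered as two widening conversions (f16->f32, then f32->f64) on the
    ; fixed-length "VL" path introduced by this patch.
    %d = fpext <4 x half> %a to <4 x double>
    store <4 x double> %d, <4 x double>* %y
    ret void
  }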

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D97301

Added: 
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
    llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f18966706639..69610218786d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -588,6 +588,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         // By default everything must be expanded.
         for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
           setOperationAction(Op, VT, Expand);
+        for (MVT OtherVT : MVT::fp_fixedlen_vector_valuetypes()) {
+          setLoadExtAction(ISD::EXTLOAD, OtherVT, VT, Expand);
+          setTruncStoreAction(VT, OtherVT, Expand);
+        }
 
         // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
@@ -606,6 +610,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         setOperationAction(ISD::FSQRT, VT, Custom);
         setOperationAction(ISD::FMA, VT, Custom);
 
+        setOperationAction(ISD::FP_ROUND, VT, Custom);
+        setOperationAction(ISD::FP_EXTEND, VT, Custom);
+
         for (auto CC : VFPCCToExpand)
           setCondCodeAction(CC, VT, Expand);
 
@@ -1081,6 +1088,21 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT,
+                                     SDLoc DL, SelectionDAG &DAG,
+                                     const RISCVSubtarget &Subtarget) {
+  if (VT.isScalableVector())
+    return DAG.getFPExtendOrRound(Op, DL, VT);
+  assert(VT.isFixedLengthVector() &&
+         "Unexpected value type for RVV FP extend/round lowering");
+  SDValue Mask, VL;
+  std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+  unsigned RVVOpc = ContainerVT.bitsGT(Op.getSimpleValueType())
+                        ? RISCVISD::FP_EXTEND_VL
+                        : RISCVISD::FP_ROUND_VL;
+  return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL);
+}
+
 SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -1254,33 +1276,86 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     // RVV can only do fp_extend to types double the size as the source. We
     // custom-lower f16->f64 extensions to two hops of ISD::FP_EXTEND, going
     // via f32.
+    SDLoc DL(Op);
     MVT VT = Op.getSimpleValueType();
-    MVT SrcVT = Op.getOperand(0).getSimpleValueType();
-    // We only need to close the gap between vXf16->vXf64.
+    SDValue Src = Op.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+
+    // Prepare any fixed-length vector operands.
+    MVT ContainerVT = VT;
+    if (SrcVT.isFixedLengthVector()) {
+      ContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector(
+          DAG, VT, Subtarget);
+      MVT SrcContainerVT =
+          ContainerVT.changeVectorElementType(SrcVT.getVectorElementType());
+      Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget);
+    }
+
     if (!VT.isVector() || VT.getVectorElementType() != MVT::f64 ||
-        SrcVT.getVectorElementType() != MVT::f16)
-      return Op;
-    SDLoc DL(Op);
-    MVT InterVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
-    SDValue IntermediateRound =
-        DAG.getFPExtendOrRound(Op.getOperand(0), DL, InterVT);
-    return DAG.getFPExtendOrRound(IntermediateRound, DL, VT);
+        SrcVT.getVectorElementType() != MVT::f16) {
+      // For scalable vectors, we only need to close the gap between
+      // vXf16->vXf64.
+      if (!VT.isFixedLengthVector())
+        return Op;
+      // For fixed-length vectors, lower the FP_EXTEND to a custom "VL" version.
+      Src = getRVVFPExtendOrRound(Src, VT, ContainerVT, DL, DAG, Subtarget);
+      return convertFromScalableVector(VT, Src, DAG, Subtarget);
+    }
+
+    MVT InterVT = VT.changeVectorElementType(MVT::f32);
+    MVT InterContainerVT = ContainerVT.changeVectorElementType(MVT::f32);
+    SDValue IntermediateExtend = getRVVFPExtendOrRound(
+        Src, InterVT, InterContainerVT, DL, DAG, Subtarget);
+
+    SDValue Extend = getRVVFPExtendOrRound(IntermediateExtend, VT, ContainerVT,
+                                           DL, DAG, Subtarget);
+    if (VT.isFixedLengthVector())
+      return convertFromScalableVector(VT, Extend, DAG, Subtarget);
+    return Extend;
   }
   case ISD::FP_ROUND: {
     // RVV can only do fp_round to types half the size as the source. We
     // custom-lower f64->f16 rounds via RVV's round-to-odd float
     // conversion instruction.
+    SDLoc DL(Op);
     MVT VT = Op.getSimpleValueType();
-    MVT SrcVT = Op.getOperand(0).getSimpleValueType();
-    // We only need to close the gap between vXf64<->vXf16.
+    SDValue Src = Op.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+
+    // Prepare any fixed-length vector operands.
+    MVT ContainerVT = VT;
+    if (VT.isFixedLengthVector()) {
+      MVT SrcContainerVT =
+          RISCVTargetLowering::getContainerForFixedLengthVector(DAG, SrcVT,
+                                                                Subtarget);
+      ContainerVT =
+          SrcContainerVT.changeVectorElementType(VT.getVectorElementType());
+      Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget);
+    }
+
     if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
-        SrcVT.getVectorElementType() != MVT::f64)
-      return Op;
-    SDLoc DL(Op);
-    MVT InterVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+        SrcVT.getVectorElementType() != MVT::f64) {
+      // For scalable vectors, we only need to close the gap between
+      // vXf64<->vXf16.
+      if (!VT.isFixedLengthVector())
+        return Op;
+      // For fixed-length vectors, lower the FP_ROUND to a custom "VL" version.
+      Src = getRVVFPExtendOrRound(Src, VT, ContainerVT, DL, DAG, Subtarget);
+      return convertFromScalableVector(VT, Src, DAG, Subtarget);
+    }
+
+    SDValue Mask, VL;
+    std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+    MVT InterVT = ContainerVT.changeVectorElementType(MVT::f32);
     SDValue IntermediateRound =
-        DAG.getNode(RISCVISD::VFNCVT_ROD, DL, InterVT, Op.getOperand(0));
-    return DAG.getFPExtendOrRound(IntermediateRound, DL, VT);
+        DAG.getNode(RISCVISD::VFNCVT_ROD_VL, DL, InterVT, Src, Mask, VL);
+    SDValue Round = getRVVFPExtendOrRound(IntermediateRound, VT, ContainerVT,
+                                          DL, DAG, Subtarget);
+
+    if (VT.isFixedLengthVector())
+      return convertFromScalableVector(VT, Round, DAG, Subtarget);
+    return Round;
   }
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
@@ -5460,7 +5535,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VSLIDEUP_VL)
   NODE_NAME_CASE(VSLIDEDOWN_VL)
   NODE_NAME_CASE(VID_VL)
-  NODE_NAME_CASE(VFNCVT_ROD)
+  NODE_NAME_CASE(VFNCVT_ROD_VL)
   NODE_NAME_CASE(VECREDUCE_ADD)
   NODE_NAME_CASE(VECREDUCE_UMAX)
   NODE_NAME_CASE(VECREDUCE_SMAX)
@@ -5498,6 +5573,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(UMAX_VL)
   NODE_NAME_CASE(MULHS_VL)
   NODE_NAME_CASE(MULHU_VL)
+  NODE_NAME_CASE(FP_ROUND_VL)
+  NODE_NAME_CASE(FP_EXTEND_VL)
   NODE_NAME_CASE(SETCC_VL)
   NODE_NAME_CASE(VSELECT_VL)
   NODE_NAME_CASE(VMAND_VL)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index a75ebc38cc2e..606d171dbb59 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -122,8 +122,9 @@ enum NodeType : unsigned {
   VID_VL,
   // Matches the semantics of the vfcnvt.rod function (Convert double-width
   // float to single-width float, rounding towards odd). Takes a double-width
-  // float vector and produces a single-width float vector.
-  VFNCVT_ROD,
+  // float vector and produces a single-width float vector. Also has a mask and
+  // VL operand.
+  VFNCVT_ROD_VL,
   // These nodes match the semantics of the corresponding RVV vector reduction
   // instructions. They produce a vector result which is the reduction
   // performed over the first vector operand plus the first element of the
@@ -175,6 +176,8 @@ enum NodeType : unsigned {
   UMAX_VL,
   MULHS_VL,
   MULHU_VL,
+  FP_ROUND_VL,
+  FP_EXTEND_VL,
 
   // Vector compare producing a mask. Fourth operand is input mask. Fifth
   // operand is VL.

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index c552865c6ec9..a988a0ed2aaf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -538,10 +538,6 @@ foreach mti = AllMasks in {
 
 } // Predicates = [HasStdExtV]
 
-def riscv_fncvt_rod
-    : SDNode<"RISCVISD::VFNCVT_ROD",
-             SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>, []>;
-
 let Predicates = [HasStdExtV, HasStdExtF] in {
 
 // 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
@@ -719,12 +715,7 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
   def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
             (!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX)
                 fwti.RegClass:$rs1, fvti.AVL, fvti.SEW)>;
-
-  def : Pat<(fvti.Vector (riscv_fncvt_rod (fwti.Vector fwti.RegClass:$rs1))),
-            (!cast<Instruction>("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX)
-                fwti.RegClass:$rs1, fvti.AVL, fvti.SEW)>;
 }
-
 } // Predicates = [HasStdExtV, HasStdExtF]
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 2d5f8fa447fc..e1543cabf833 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -96,6 +96,20 @@ def SDT_RISCVVecFMA_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>,
                                               SDTCisVT<5, XLenVT>]>;
 def riscv_fma_vl : SDNode<"RISCVISD::FMA_VL", SDT_RISCVVecFMA_VL, [SDNPCommutative]>;
 
+def SDT_RISCVFPRoundOp_VL  : SDTypeProfile<1, 3, [
+  SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>,
+  SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>
+]>;
+def SDT_RISCVFPExtendOp_VL  : SDTypeProfile<1, 3, [
+  SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1>,
+  SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>
+]>;
+
+def riscv_fpround_vl : SDNode<"RISCVISD::FP_ROUND_VL", SDT_RISCVFPRoundOp_VL>;
+def riscv_fpextend_vl : SDNode<"RISCVISD::FP_EXTEND_VL", SDT_RISCVFPExtendOp_VL>;
+def riscv_fncvt_rod_vl : SDNode<"RISCVISD::VFNCVT_ROD_VL", SDT_RISCVFPRoundOp_VL>;
+
+
 def riscv_setcc_vl : SDNode<"RISCVISD::SETCC_VL",
                             SDTypeProfile<1, 5, [SDTCVecEltisVT<0, i1>,
                                                  SDTCisVec<1>,
@@ -740,6 +754,33 @@ foreach fvti = AllFloatVectors in {
                                 fvti.LMul.MX)
              (fvti.Scalar fvti.ScalarRegClass:$rs2),
              GPR:$vl, fvti.SEW)>;
+
+  // 14.18. Widening Floating-Point/Integer Type-Convert Instructions
+  foreach fvtiToFWti = AllWidenableFloatVectors in {
+    defvar fvti = fvtiToFWti.Vti;
+    defvar fwti = fvtiToFWti.Wti;
+    def : Pat<(fwti.Vector (riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1),
+                                              (fvti.Mask true_mask),
+                                              (XLenVT (VLOp GPR:$vl)))),
+              (!cast<Instruction>("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX)
+                  fvti.RegClass:$rs1, GPR:$vl, fvti.SEW)>;
+  }
+
+  foreach fvtiToFWti = AllWidenableFloatVectors in {
+    defvar fvti = fvtiToFWti.Vti;
+    defvar fwti = fvtiToFWti.Wti;
+    def : Pat<(fvti.Vector (riscv_fpround_vl (fwti.Vector fwti.RegClass:$rs1),
+                                             (fwti.Mask true_mask),
+                                             (XLenVT (VLOp GPR:$vl)))),
+              (!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX)
+                  fwti.RegClass:$rs1, GPR:$vl, fvti.SEW)>;
+
+    def : Pat<(fvti.Vector (riscv_fncvt_rod_vl (fwti.Vector fwti.RegClass:$rs1),
+                                               (fwti.Mask true_mask),
+                                               (XLenVT (VLOp GPR:$vl)))),
+              (!cast<Instruction>("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX)
+                  fwti.RegClass:$rs1, GPR:$vl, fvti.SEW)>;
+  }
 }
 
 } // Predicates = [HasStdExtV, HasStdExtF]

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
new file mode 100644
index 000000000000..db2628822816
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
+
+define void @fpext_v2f16_v2f32(<2 x half>* %x, <2 x float>* %y) {
+; CHECK-LABEL: fpext_v2f16_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; CHECK-NEXT:    vfwcvt.f.f.v v26, v25
+; CHECK-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vse32.v v26, (a1)
+; CHECK-NEXT:    ret
+  %a = load <2 x half>, <2 x half>* %x
+  %d = fpext <2 x half> %a to <2 x float>
+  store <2 x float> %d, <2 x float>* %y
+  ret void
+}
+
+define void @fpext_v2f16_v2f64(<2 x half>* %x, <2 x double>* %y) {
+; CHECK-LABEL: fpext_v2f16_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; CHECK-NEXT:    vfwcvt.f.f.v v26, v25
+; CHECK-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; CHECK-NEXT:    vfwcvt.f.f.v v25, v26
+; CHECK-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; CHECK-NEXT:    vse64.v v25, (a1)
+; CHECK-NEXT:    ret
+  %a = load <2 x half>, <2 x half>* %x
+  %d = fpext <2 x half> %a to <2 x double>
+  store <2 x double> %d, <2 x double>* %y
+  ret void
+}
+
+define void @fpext_v8f16_v8f32(<8 x half>* %x, <8 x float>* %y) {
+; LMULMAX8-LABEL: fpext_v8f16_v8f32:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    vsetivli a2, 8, e16,m1,ta,mu
+; LMULMAX8-NEXT:    vle16.v v25, (a0)
+; LMULMAX8-NEXT:    vfwcvt.f.f.v v26, v25
+; LMULMAX8-NEXT:    vsetivli a0, 8, e32,m2,ta,mu
+; LMULMAX8-NEXT:    vse32.v v26, (a1)
+; LMULMAX8-NEXT:    ret
+;
+; LMULMAX1-LABEL: fpext_v8f16_v8f32:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a2, 8, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vle16.v v25, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v26, v25
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vslidedown.vi v25, v25, 4
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v27, v25
+; LMULMAX1-NEXT:    addi a0, a1, 16
+; LMULMAX1-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vse32.v v27, (a0)
+; LMULMAX1-NEXT:    vse32.v v26, (a1)
+; LMULMAX1-NEXT:    ret
+  %a = load <8 x half>, <8 x half>* %x
+  %d = fpext <8 x half> %a to <8 x float>
+  store <8 x float> %d, <8 x float>* %y
+  ret void
+}
+
+define void @fpext_v8f16_v8f64(<8 x half>* %x, <8 x double>* %y) {
+; LMULMAX8-LABEL: fpext_v8f16_v8f64:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    vsetivli a2, 8, e16,m1,ta,mu
+; LMULMAX8-NEXT:    vle16.v v25, (a0)
+; LMULMAX8-NEXT:    vfwcvt.f.f.v v26, v25
+; LMULMAX8-NEXT:    vsetivli a0, 8, e32,m2,ta,mu
+; LMULMAX8-NEXT:    vfwcvt.f.f.v v28, v26
+; LMULMAX8-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
+; LMULMAX8-NEXT:    vse64.v v28, (a1)
+; LMULMAX8-NEXT:    ret
+;
+; LMULMAX1-LABEL: fpext_v8f16_v8f64:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a2, 8, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vle16.v v25, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vslidedown.vi v26, v25, 2
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v27, v26
+; LMULMAX1-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v26, v27
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v27, v25
+; LMULMAX1-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v28, v27
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vslidedown.vi v25, v25, 4
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v27, v25
+; LMULMAX1-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v29, v27
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vslidedown.vi v25, v25, 2
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v27, v25
+; LMULMAX1-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; LMULMAX1-NEXT:    vfwcvt.f.f.v v25, v27
+; LMULMAX1-NEXT:    addi a0, a1, 48
+; LMULMAX1-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX1-NEXT:    vse64.v v25, (a0)
+; LMULMAX1-NEXT:    addi a0, a1, 32
+; LMULMAX1-NEXT:    vse64.v v29, (a0)
+; LMULMAX1-NEXT:    vse64.v v28, (a1)
+; LMULMAX1-NEXT:    addi a0, a1, 16
+; LMULMAX1-NEXT:    vse64.v v26, (a0)
+; LMULMAX1-NEXT:    ret
+  %a = load <8 x half>, <8 x half>* %x
+  %d = fpext <8 x half> %a to <8 x double>
+  store <8 x double> %d, <8 x double>* %y
+  ret void
+}
+
+define void @fpround_v2f32_v2f16(<2 x float>* %x, <2 x half>* %y) {
+; CHECK-LABEL: fpround_v2f32_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; CHECK-NEXT:    vfncvt.f.f.w v26, v25
+; CHECK-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; CHECK-NEXT:    vse16.v v26, (a1)
+; CHECK-NEXT:    ret
+  %a = load <2 x float>, <2 x float>* %x
+  %d = fptrunc <2 x float> %a to <2 x half>
+  store <2 x half> %d, <2 x half>* %y
+  ret void
+}
+
+define void @fpround_v2f64_v2f16(<2 x double>* %x, <2 x half>* %y) {
+; CHECK-LABEL: fpround_v2f64_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; CHECK-NEXT:    vfncvt.rod.f.f.w v26, v25
+; CHECK-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; CHECK-NEXT:    vfncvt.f.f.w v25, v26
+; CHECK-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; CHECK-NEXT:    vse16.v v25, (a1)
+; CHECK-NEXT:    ret
+  %a = load <2 x double>, <2 x double>* %x
+  %d = fptrunc <2 x double> %a to <2 x half>
+  store <2 x half> %d, <2 x half>* %y
+  ret void
+}
+
+define void @fpround_v8f32_v8f16(<8 x float>* %x, <8 x half>* %y) {
+; LMULMAX8-LABEL: fpround_v8f32_v8f16:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
+; LMULMAX8-NEXT:    vle32.v v26, (a0)
+; LMULMAX8-NEXT:    vsetivli a0, 8, e16,m1,ta,mu
+; LMULMAX8-NEXT:    vfncvt.f.f.w v25, v26
+; LMULMAX8-NEXT:    vse16.v v25, (a1)
+; LMULMAX8-NEXT:    ret
+;
+; LMULMAX1-LABEL: fpround_v8f32_v8f16:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi sp, sp, -16
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    addi a2, a0, 16
+; LMULMAX1-NEXT:    vle32.v v25, (a2)
+; LMULMAX1-NEXT:    vle32.v v26, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; LMULMAX1-NEXT:    vfncvt.f.f.w v27, v25
+; LMULMAX1-NEXT:    addi a0, sp, 8
+; LMULMAX1-NEXT:    vsetivli a2, 4, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vse16.v v27, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; LMULMAX1-NEXT:    vfncvt.f.f.w v25, v26
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vse16.v v25, (sp)
+; LMULMAX1-NEXT:    vsetivli a0, 8, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-NEXT:    vse16.v v25, (a1)
+; LMULMAX1-NEXT:    addi sp, sp, 16
+; LMULMAX1-NEXT:    ret
+  %a = load <8 x float>, <8 x float>* %x
+  %d = fptrunc <8 x float> %a to <8 x half>
+  store <8 x half> %d, <8 x half>* %y
+  ret void
+}
+
+define void @fpround_v8f64_v8f16(<8 x double>* %x, <8 x half>* %y) {
+; LMULMAX8-LABEL: fpround_v8f64_v8f16:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    vsetivli a2, 8, e64,m4,ta,mu
+; LMULMAX8-NEXT:    vle64.v v28, (a0)
+; LMULMAX8-NEXT:    vsetivli a0, 8, e32,m2,ta,mu
+; LMULMAX8-NEXT:    vfncvt.rod.f.f.w v26, v28
+; LMULMAX8-NEXT:    vsetivli a0, 8, e16,m1,ta,mu
+; LMULMAX8-NEXT:    vfncvt.f.f.w v25, v26
+; LMULMAX8-NEXT:    vse16.v v25, (a1)
+; LMULMAX8-NEXT:    ret
+;
+; LMULMAX1-LABEL: fpround_v8f64_v8f16:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi sp, sp, -32
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX1-NEXT:    vle64.v v25, (a0)
+; LMULMAX1-NEXT:    addi a2, a0, 32
+; LMULMAX1-NEXT:    vle64.v v26, (a2)
+; LMULMAX1-NEXT:    addi a2, a0, 48
+; LMULMAX1-NEXT:    vle64.v v27, (a2)
+; LMULMAX1-NEXT:    addi a0, a0, 16
+; LMULMAX1-NEXT:    vle64.v v28, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; LMULMAX1-NEXT:    vfncvt.rod.f.f.w v29, v27
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; LMULMAX1-NEXT:    vfncvt.f.f.w v27, v29
+; LMULMAX1-NEXT:    addi a0, sp, 12
+; LMULMAX1-NEXT:    vsetivli a2, 2, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vse16.v v27, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; LMULMAX1-NEXT:    vfncvt.rod.f.f.w v27, v28
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; LMULMAX1-NEXT:    vfncvt.f.f.w v28, v27
+; LMULMAX1-NEXT:    addi a0, sp, 4
+; LMULMAX1-NEXT:    vsetivli a2, 2, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vse16.v v28, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; LMULMAX1-NEXT:    vfncvt.rod.f.f.w v27, v26
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; LMULMAX1-NEXT:    vfncvt.f.f.w v26, v27
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; LMULMAX1-NEXT:    addi a0, sp, 8
+; LMULMAX1-NEXT:    vse16.v v26, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; LMULMAX1-NEXT:    addi a0, sp, 8
+; LMULMAX1-NEXT:    vle16.v v26, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 24
+; LMULMAX1-NEXT:    vse16.v v26, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; LMULMAX1-NEXT:    vfncvt.rod.f.f.w v26, v25
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; LMULMAX1-NEXT:    vfncvt.f.f.w v25, v26
+; LMULMAX1-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vse16.v v25, (sp)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vse16.v v25, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 8, e16,m1,ta,mu
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vle16.v v25, (a0)
+; LMULMAX1-NEXT:    vse16.v v25, (a1)
+; LMULMAX1-NEXT:    addi sp, sp, 32
+; LMULMAX1-NEXT:    ret
+  %a = load <8 x double>, <8 x double>* %x
+  %d = fptrunc <8 x double> %a to <8 x half>
+  store <8 x half> %d, <8 x half>* %y
+  ret void
+}