[llvm] [SystemZ] Support fp16 vector ABI and basic codegen. (PR #171066)

Sun Dec 7 16:22:29 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-systemz

Author: Jonas Paulsson (JonPsson1)

<details>
<summary>Changes</summary>

- Make v8f16 a legal type so that arguments can be passed in vector registers. Handle fp16 vectors so that they have the same ABI as other fp vectors. 

- Set the preferred vector action for fp16 vectors to "split". This will scalarize all operations, which is not always necessary (like with memory operations), but it avoids the superfluous operations that result after first widening and then scalarizing a narrow vector (like v4f16).

This seems to be the better handling, at least if narrow (e.g. 4 element) vectors are considered and not supposed to get all 8 ops emitted. Patch in progress, with some tests in place that are passing. There are several opportunities for optimizing the results, but not sure if or what would be relevant at this point, with the conversion routines having a heavy overhead.

Fixes #168992



---

Patch is 135.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171066.diff


12 Files Affected:

- (modified) llvm/lib/Target/SystemZ/SystemZCallingConv.td (+4-4) 
- (modified) llvm/lib/Target/SystemZ/SystemZISelLowering.cpp (+72-15) 
- (modified) llvm/lib/Target/SystemZ/SystemZISelLowering.h (+8-16) 
- (modified) llvm/lib/Target/SystemZ/SystemZInstrVector.td (+26) 
- (modified) llvm/lib/Target/SystemZ/SystemZRegisterInfo.td (+3-3) 
- (modified) llvm/test/CodeGen/SystemZ/canonicalize-vars.ll (+108-94) 
- (added) llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll (+711) 
- (added) llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll (+519) 
- (added) llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll (+178) 
- (added) llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll (+503) 
- (added) llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll (+149) 
- (removed) llvm/test/CodeGen/SystemZ/fp-half-vector.ll (-725) 


``````````diff

diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 2795de5eeeb66..69202e3fcbc57 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,7 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
   // Sub-128 vectors are returned in the same way, but they're widened
   // to one of these types during type legalization.
   CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
              CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
 ]>;
 
@@ -116,19 +116,19 @@ def CC_SystemZ_ELF : CallingConv<[
   // are passed in the same way, but they're widened to one of these types
   // during type legalization.
   CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
              CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30,
                                          V25, V27, V29, V31]>>>>,
 
   // However, sub-128 vectors which need to go on the stack occupy just a
   // single 8-byte-aligned 8-byte stack slot.  Pass as i64.
   CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
              CCIfShortVector<CCBitConvertToType<i64>>>>,
 
   // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
   CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
              CCAssignToStack<16, 8>>>,
 
   // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2511d08a6d0ef..aaa6c22eaf01a 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -123,6 +123,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
       addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
       addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+      addRegisterClass(MVT::v8f16, &SystemZ::VR128BitRegClass);
       addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
       addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
     }
@@ -620,6 +621,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   // Handle floating-point vector types.
   if (Subtarget.hasVector()) {
     // Scalar-to-vector conversion is just a subreg.
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
 
@@ -627,6 +629,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     // need to go via integers.
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f16, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
 
@@ -842,6 +845,33 @@ bool SystemZTargetLowering::useSoftFloat() const {
   return Subtarget.hasSoftFloat();
 }
 
+unsigned SystemZTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
+                                          std::optional<MVT> RegisterVT) const {
+  // i128 inline assembly operand.
+  if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped)
+    return 1;
+  // Pass narrow fp16 vectors per the ABI even though they are generally
+  // expanded.
+  if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16)
+    return divideCeil(VT.getVectorNumElements(), SystemZ::VectorBytes / 2);
+  return TargetLowering::getNumRegisters(Context, VT);
+}
+
+MVT SystemZTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                         CallingConv::ID CC,
+                                                         EVT VT) const {
+  // 128-bit single-element vector types are passed like other vectors,
+  // not like their element type.
+  if (VT.isVector() && VT.getSizeInBits() == 128 &&
+      VT.getVectorNumElements() == 1)
+    return MVT::v16i8;
+  // Pass narrow fp16 vectors per the ABI even though they are generally
+  // expanded.
+  if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16)
+    return MVT::v8f16;
+  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
 EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
                                               LLVMContext &, EVT VT) const {
   if (!VT.isVector())
@@ -2051,6 +2081,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
       case MVT::v8i16:
       case MVT::v4i32:
       case MVT::v2i64:
+      case MVT::v8f16:
       case MVT::v4f32:
       case MVT::v2f64:
         RC = &SystemZ::VR128BitRegClass;
@@ -6351,6 +6382,37 @@ bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
   return false;
 }
 
+static SDValue mergeHighParts(SelectionDAG &DAG, const SDLoc &DL,
+                              unsigned MergedBits, EVT VT, SDValue Op0,
+                              SDValue Op1) {
+  MVT IntVecVT = MVT::getVectorVT(MVT::getIntegerVT(MergedBits),
+                                  SystemZ::VectorBits / MergedBits);
+  assert(VT.getSizeInBits() == 128 && IntVecVT.getSizeInBits() == 128 &&
+         "Handling full vectors only.");
+  Op0 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0);
+  Op1 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op1);
+  SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
+                           DL, IntVecVT, Op0, Op1);
+  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+}
+
+static SDValue buildFPVecFromScalars4(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                                      SmallVectorImpl<SDValue> &Elems,
+                                      unsigned Pos) {
+  SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 0], Elems[Pos + 1]);
+  SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 2], Elems[Pos + 3]);
+  // Avoid unnecessary undefs by reusing the other operand.
+  if (Op01.isUndef())
+    Op01 = Op23;
+  else if (Op23.isUndef())
+    Op23 = Op01;
+  // Merging identical replications is a no-op.
+  if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+    return Op01;
+  unsigned MergedBits = VT.getSimpleVT().getScalarSizeInBits() * 2;
+  return mergeHighParts(DAG, DL, MergedBits, VT, Op01, Op23);
+}
+
 // Combine GPR scalar values Elems into a vector of type VT.
 SDValue
 SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
@@ -6409,22 +6471,17 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
   //      <ABxx>         <CDxx>
   //                V                 VMRHG
   //              <ABCD>
-  if (VT == MVT::v4f32 && !AllLoads) {
-    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
-    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
-    // Avoid unnecessary undefs by reusing the other operand.
-    if (Op01.isUndef())
-      Op01 = Op23;
-    else if (Op23.isUndef())
-      Op23 = Op01;
+  if (VT == MVT::v4f32 && !AllLoads)
+    return buildFPVecFromScalars4(DAG, DL, VT, Elems, 0);
+
+  // Same for v8i16.
+  if (VT == MVT::v8f16 && !AllLoads) {
+    SDValue Op0123 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 0);
+    SDValue Op4567 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 4);
     // Merging identical replications is a no-op.
-    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
-      return Op01;
-    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
-    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
-    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
-                             DL, MVT::v2i64, Op01, Op23);
-    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+    if (Op0123.getOpcode() == SystemZISD::REPLICATE && Op0123 == Op4567)
+      return Op0123;
+    return mergeHighParts(DAG, DL, 64, VT, Op0123, Op4567);
   }
 
   // Collect the constant terms.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 13a1cd1614a53..ca47b96ef2d80 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -64,27 +64,19 @@ class SystemZTargetLowering : public TargetLowering {
     //
     // (c) there are no multiplication instructions for the widest integer
     //     type (v2i64).
+
+    // Expand (narrow) f16 vectors during type legalization to avoid
+    // operations for all elements as with expansion after widening.
+    if (VT.getScalarType() == MVT::f16)
+      return VT.getVectorElementCount().isScalar() ? TypeScalarizeVector : TypeSplitVector;
     if (VT.getScalarSizeInBits() % 8 == 0)
       return TypeWidenVector;
     return TargetLoweringBase::getPreferredVectorAction(VT);
   }
-  unsigned
-  getNumRegisters(LLVMContext &Context, EVT VT,
-                  std::optional<MVT> RegisterVT) const override {
-    // i128 inline assembly operand.
-    if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped)
-      return 1;
-    return TargetLowering::getNumRegisters(Context, VT);
-  }
+  unsigned getNumRegisters(LLVMContext &Context, EVT VT,
+                           std::optional<MVT> RegisterVT) const override;
   MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
-                                    EVT VT) const override {
-    // 128-bit single-element vector types are passed like other vectors,
-    // not like their element type.
-    if (VT.isVector() && VT.getSizeInBits() == 128 &&
-        VT.getVectorNumElements() == 1)
-      return MVT::v16i8;
-    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
-  }
+                                    EVT VT) const override;
   bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   bool isCheapToSpeculateCttz(Type *) const override { return true; }
   bool preferZeroCompareBranch() const override { return true; }
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 479bab5ce62b8..3eb66d06cc16d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -348,6 +348,7 @@ let Predicates = [FeatureVector] in {
   def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>;
   def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>;
   def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
+  def : BinaryRRWithType<VMRHH, VR128, z_merge_high, v8f16>;
   def : BinaryRRWithType<VMRHF, VR128, z_merge_high, v4f32>;
   def : BinaryRRWithType<VMRHG, VR128, z_merge_high, v2f64>;
 
@@ -357,6 +358,7 @@ let Predicates = [FeatureVector] in {
   def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>;
   def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>;
   def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>;
+  def : BinaryRRWithType<VMRLH, VR128, z_merge_low, v8f16>;
   def : BinaryRRWithType<VMRLF, VR128, z_merge_low, v4f32>;
   def : BinaryRRWithType<VMRLG, VR128, z_merge_low, v2f64>;
 
@@ -497,6 +499,7 @@ defm : GenericVectorOps<v16i8, v16i8>;
 defm : GenericVectorOps<v8i16, v8i16>;
 defm : GenericVectorOps<v4i32, v4i32>;
 defm : GenericVectorOps<v2i64, v2i64>;
+defm : GenericVectorOps<v8f16, v8i16>;
 defm : GenericVectorOps<v4f32, v4i32>;
 defm : GenericVectorOps<v2f64, v2i64>;
 
@@ -2110,6 +2113,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (i128  VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (f128  VR128:$src))), (v16i8 VR128:$src)>;
@@ -2118,6 +2122,7 @@ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (i128  VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
 def : Pat<(v8i16 (bitconvert (f128  VR128:$src))), (v8i16 VR128:$src)>;
@@ -2126,6 +2131,7 @@ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (i128  VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
 def : Pat<(v4i32 (bitconvert (f128  VR128:$src))), (v4i32 VR128:$src)>;
@@ -2134,15 +2140,26 @@ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (i128  VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
 def : Pat<(v2i64 (bitconvert (f128  VR128:$src))), (v2i64 VR128:$src)>;
 
+def : Pat<(v8f16 (bitconvert (v16i8 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (i128  VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (f128  VR128:$src))), (v8f16 VR128:$src)>;
+
 def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
 def : Pat<(v4f32 (bitconvert (i128  VR128:$src))), (v4f32 VR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 VR128:$src))), (v4f32 VR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
 def : Pat<(v4f32 (bitconvert (f128  VR128:$src))), (v4f32 VR128:$src)>;
 
@@ -2151,6 +2168,7 @@ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
 def : Pat<(v2f64 (bitconvert (i128  VR128:$src))), (v2f64 VR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8f16 VR128:$src))), (v2f64 VR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
 def : Pat<(v2f64 (bitconvert (f128  VR128:$src))), (v2f64 VR128:$src)>;
 
@@ -2159,6 +2177,7 @@ def : Pat<(f128  (bitconvert (v8i16 VR128:$src))), (f128  VR128:$src)>;
 def : Pat<(f128  (bitconvert (v4i32 VR128:$src))), (f128  VR128:$src)>;
 def : Pat<(f128  (bitconvert (v2i64 VR128:$src))), (f128  VR128:$src)>;
 def : Pat<(f128  (bitconvert (i128  VR128:$src))), (f128  VR128:$src)>;
+def : Pat<(f128  (bitconvert (v8f16 VR128:$src))), (f128  VR128:$src)>;
 def : Pat<(f128  (bitconvert (v4f32 VR128:$src))), (f128  VR128:$src)>;
 def : Pat<(f128  (bitconvert (v2f64 VR128:$src))), (f128  VR128:$src)>;
 
@@ -2166,6 +2185,7 @@ def : Pat<(i128  (bitconvert (v16i8 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (v8i16 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (v4i32 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (v2i64 VR128:$src))), (i128  VR128:$src)>;
+def : Pat<(i128  (bitconvert (v8f16 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (v4f32 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (v2f64 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (f128  VR128:$src))), (i128  VR128:$src)>;
@@ -2216,6 +2236,7 @@ multiclass ScalarToVectorFP<Instruction vrep, ValueType vt, RegisterOperand cls,
             (vrep (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar,
                                  subreg), 0)>;
 }
+defm : ScalarToVectorFP<VREPH, v8f16, FP16, subreg_h16>;
 defm : ScalarToVectorFP<VREPF, v4f32, FP32, subreg_h32>;
 defm : ScalarToVectorFP<VREPG, v2f64, FP64, subreg_h64>;
 
@@ -2236,6 +2257,11 @@ let AddedComplexity = 4 in {
 // 3 added by TableGen for the base register operand in VLGV-based integer
 // extractions and ensures that this version is strictly better.
 let AddedComplexity = 4 in {
+  def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), 0)),
+            (EXTRACT_SUBREG VR128:$vec, subreg_h16)>;
+  def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), imm32zx3:$index)),
+            (EXTRACT_SUBREG (VREPH VR128:$vec, imm32zx2:$index), subreg_h16)>;
+
   def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)),
             (EXTRACT_SUBREG VR128:$vec, subreg_h32)>;
   def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)),
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index e79f12b449a88..1ef8e81c8f829 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -305,13 +305,13 @@ defm VR64 : SystemZRegClass<"VR64", [f64, v8i8, v4i16, v2i32, v2f32], 64,
 // The subset of vector registers that can be used for floating-point
 // operations too.
 defm VF128 : SystemZRegClass<"VF128",
-                             [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
-                             (sequence "V%u", 0, 15)>;
+                             [v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+                             128, (sequence "V%u", 0, 15)>;
 
 // All vector registers.
 defm VR128 : SystemZRegClass<"VR128",
                              [v16i8, v8i16, v4i32, v2i64, i128,
-                              v4f32, v2f64, f128],
+                              v8f16, v4f32, v2f64, f128],
                              128, (add (sequence "V%u", 0, 7),
                                        (sequence "V%u", 16, 31),
                                        (sequence "V%u", 8, 15))>;
diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
index e02f931c4d31e..d0f3414e89497 100644
--- a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
+++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
@@ -111,87 +111,93 @@ define void @canonicalize_ptr_f128(ptr %out) {
 define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind {
 ; Z16-LABEL: canonicalize_v8f16:
 ; Z16:       # %bb.0:
-; Z16-NEXT:    stmg %r13, %r15, 104(%r15)
+; Z16-NEXT:    stmg %r14, %r15, 112(%r15)
 ; Z16-NEXT:    aghi %r15, -224
-; Z16-NEXT:    std %f8, 216(%r15) # 8-byte Spill
-; Z16-NEXT:    std %f9, 208(%r15) # 8-byte Spill
-; Z16-NEXT:    std %f10, 200(%r15) # 8-byte Spill
-; Z16-NEXT:    std %f11, 192(%r15) # 8-byte Spill
-; Z16-NEXT:    std %f12, 184(%r15) # 8-byte Spill
-; Z16-NEXT:    std %f13, 17...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/171066