[llvm] [SystemZ] Support fp16 vector ABI and basic codegen. (PR #171066)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 8 08:47:07 PST 2025
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/171066
>From 390fbdad322e323484ace8796d4a63323603da32 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 28 Nov 2025 03:44:48 +0100
Subject: [PATCH 1/4] Support for f16 vectors, first try.
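With this patch, a full 16-byte <8 x half> vector follows the same
vector-register ABI as the other 128-bit vector types. A minimal sketch of
what that enables (function name hypothetical; with -mcpu=z16 the argument
arrives in %v24 and the result is returned in %v24):

  define <8 x half> @pass_through(<8 x half> %a) {
    ret <8 x half> %a
  }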
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +
llvm/lib/Target/SystemZ/SystemZCallingConv.td | 8 +-
.../Target/SystemZ/SystemZISelLowering.cpp | 67 +-
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 26 +
.../lib/Target/SystemZ/SystemZRegisterInfo.td | 6 +-
.../test/CodeGen/SystemZ/canonicalize-vars.ll | 202 ++--
.../CodeGen/SystemZ/fp-half-vector-args.ll | 639 +++++++++++++
.../CodeGen/SystemZ/fp-half-vector-binops.ll | 888 ++++++++++++++++++
.../SystemZ/fp-half-vector-conversions.ll | 2 +
.../SystemZ/fp-half-vector-fcmp-vsel.ll | 118 +++
.../CodeGen/SystemZ/fp-half-vector-mem.ll | 145 +++
llvm/test/CodeGen/SystemZ/fp-half-vector.ll | 318 +++----
12 files changed, 2138 insertions(+), 283 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-conversions.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-vsel.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b009e6a3d5f5f..9a8743cf44b85 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7432,6 +7432,8 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL,
case ISD::FREM:
// If both operands are undef, the result is undef. If 1 operand is undef,
// the result is NaN. This should match the behavior of the IR optimizer.
+ // XXX What if the other operand becomes undef later: should NaN + undef
+ // then fold to undef?
if (N1.isUndef() && N2.isUndef())
return getUNDEF(VT);
if (N1.isUndef() || N2.isUndef())
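A quick IR illustration of the folding rule this comment refers to (values
chosen arbitrarily):

  %a = frem double undef, undef ; both operands undef -> folds to undef
  %b = frem double 1.0, undef   ; one operand undef   -> folds to NaN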
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 2795de5eeeb66..69202e3fcbc57 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,7 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
// Sub-128 vectors are returned in the same way, but they're widened
// to one of these types during type legalization.
CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
]>;
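To illustrate the widening rule above for the new type (declaration
hypothetical): a sub-128-bit half vector such as <4 x half> is widened to
v8f16 by type legalization, so the result still comes back in %v24, with the
upper four lanes undefined:

  declare <4 x half> @get4() ; returned widened, in %v24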
@@ -116,19 +116,19 @@ def CC_SystemZ_ELF : CallingConv<[
// are passed in the same way, but they're widened to one of these types
// during type legalization.
CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30,
V25, V27, V29, V31]>>>>,
// However, sub-128 vectors which need to go on the stack occupy just a
// single 8-byte-aligned 8-byte stack slot. Pass as i64.
CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCIfShortVector<CCBitConvertToType<i64>>>>,
// Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCAssignToStack<16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
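A sketch of the short-vector stack rule above: once the eight vector argument
registers %v24-%v31 are used up, a further <4 x half> argument is bitcast to
i64 and takes a single 8-byte-aligned 8-byte slot (callee name hypothetical):

  declare void @callee(<8 x half>, <8 x half>, <8 x half>, <8 x half>,
                       <8 x half>, <8 x half>, <8 x half>, <8 x half>,
                       <4 x half>) ; ninth vector argument: one i64 stack slot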
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2511d08a6d0ef..de2e018680f5b 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -123,6 +123,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v8f16, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
}
@@ -620,6 +621,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Handle floating-point vector types.
if (Subtarget.hasVector()) {
// Scalar-to-vector conversion is just a subreg.
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
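A DAG-level sketch of why this is legal for v8f16 as well (node numbers
illustrative): the f16 scalar already lives in the leftmost 16 bits of its
register, so the conversion is only a subreg_h16 reinterpretation:

  t1: f16 = ...
  t2: v8f16 = scalar_to_vector t1 ; no instruction emitted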
@@ -627,6 +629,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// need to go via integers.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
@@ -2051,6 +2054,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
+ case MVT::v8f16:
case MVT::v4f32:
case MVT::v2f64:
RC = &SystemZ::VR128BitRegClass;
@@ -6351,6 +6355,37 @@ bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
return false;
}
+static SDValue mergeHighParts(SelectionDAG &DAG, const SDLoc &DL,
+ unsigned MergedBits, EVT VT, SDValue Op0,
+ SDValue Op1) {
+ MVT IntVecVT = MVT::getVectorVT(MVT::getIntegerVT(MergedBits),
+ SystemZ::VectorBits / MergedBits);
+ assert(VT.getSizeInBits() == 128 && IntVecVT.getSizeInBits() == 128 &&
+ "Handling full vectors only.");
+ Op0 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op1);
+ SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH, DL, IntVecVT, Op0, Op1);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+}
+
+static SDValue buildFPVecFromScalars4(SelectionDAG &DAG, const SDLoc &DL,
+                                      EVT VT, SmallVectorImpl<SDValue> &Elems,
+                                      unsigned Pos) {
+ SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 0], Elems[Pos + 1]);
+ SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 2], Elems[Pos + 3]);
+ // Avoid unnecessary undefs by reusing the other operand.
+ if (Op01.isUndef())
+ Op01 = Op23;
+ else if (Op23.isUndef())
+ Op23 = Op01;
+ // Merging identical replications is a no-op.
+ if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+ return Op01;
+ unsigned MergedBits = VT.getSimpleVT().getScalarSizeInBits() * 2;
+ return mergeHighParts(DAG, DL, MergedBits, VT, Op01, Op23);
+}
+
// Combine GPR scalar values Elems into a vector of type VT.
SDValue
SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
@@ -6409,22 +6444,17 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// <ABxx> <CDxx>
// V VMRHG
// <ABCD>
- if (VT == MVT::v4f32 && !AllLoads) {
- SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
- SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
- // Avoid unnecessary undefs by reusing the other operand.
- if (Op01.isUndef())
- Op01 = Op23;
- else if (Op23.isUndef())
- Op23 = Op01;
+ if (VT == MVT::v4f32 && !AllLoads)
+ return buildFPVecFromScalars4(DAG, DL, VT, Elems, 0);
+
+ // Same for v8f16, with one more level of merges.
+ if (VT == MVT::v8f16 && !AllLoads) {
+ SDValue Op0123 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 0);
+ SDValue Op4567 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 4);
// Merging identical replications is a no-op.
- if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
- return Op01;
- Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
- Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
- SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
- DL, MVT::v2i64, Op01, Op23);
- return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ if (Op0123.getOpcode() == SystemZISD::REPLICATE && Op0123 == Op4567)
+ return Op0123;
+ return mergeHighParts(DAG, DL, 64, VT, Op0123, Op4567);
}
// Collect the constant terms.
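For v8f16 the merge ladder from the comment above gets one more level; a
rough picture (letters are the scalar elements, x is undef):

  <Axx> <Bxx>  <Cxx> <Dxx>  <Exx> <Fxx>  <Gxx> <Hxx>
    V VMRHH      V VMRHH      V VMRHH      V VMRHH
    <ABxx>       <CDxx>       <EFxx>       <GHxx>
         V VMRHF                   V VMRHF
         <ABCDxxxx>                <EFGHxxxx>
                      V VMRHG
                     <ABCDEFGH>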
@@ -7555,6 +7585,13 @@ SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
Op = Op.getOperand(0);
Index = Byte / BytesPerElement;
Force = true;
+ } else if (Opcode == ISD::SCALAR_TO_VECTOR && ResVT == MVT::f16) {
+ // The vector was first widened and then expanded. Expose undef
+ // elements to eliminate the unneeded operations.
+ EVT OpVT = Op.getValueType();
+ if (Index * ResVT.getScalarSizeInBits() >= OpVT.getScalarSizeInBits())
+ return DAG.getUNDEF(ResVT);
+ break;
} else
break;
}
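A hedged SelectionDAG sketch of the case handled here (node numbers
illustrative): only lane 0 of a SCALAR_TO_VECTOR is defined, so after an f16
value has been widened and then expanded again, any higher extraction index
folds to undef and the widen/expand pair can disappear:

  t1: v8f16 = scalar_to_vector t0
  t2: f16 = extract_vector_elt t1, Constant:i32<1> ; -> undef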
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 479bab5ce62b8..3eb66d06cc16d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -348,6 +348,7 @@ let Predicates = [FeatureVector] in {
def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>;
def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>;
def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
+ def : BinaryRRWithType<VMRHH, VR128, z_merge_high, v8f16>;
def : BinaryRRWithType<VMRHF, VR128, z_merge_high, v4f32>;
def : BinaryRRWithType<VMRHG, VR128, z_merge_high, v2f64>;
@@ -357,6 +358,7 @@ let Predicates = [FeatureVector] in {
def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>;
def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>;
def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>;
+ def : BinaryRRWithType<VMRLH, VR128, z_merge_low, v8f16>;
def : BinaryRRWithType<VMRLF, VR128, z_merge_low, v4f32>;
def : BinaryRRWithType<VMRLG, VR128, z_merge_low, v2f64>;
@@ -497,6 +499,7 @@ defm : GenericVectorOps<v16i8, v16i8>;
defm : GenericVectorOps<v8i16, v8i16>;
defm : GenericVectorOps<v4i32, v4i32>;
defm : GenericVectorOps<v2i64, v2i64>;
+defm : GenericVectorOps<v8f16, v8i16>;
defm : GenericVectorOps<v4f32, v4i32>;
defm : GenericVectorOps<v2f64, v2i64>;
@@ -2110,6 +2113,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (i128 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>;
@@ -2118,6 +2122,7 @@ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (i128 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>;
@@ -2126,6 +2131,7 @@ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (i128 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>;
@@ -2134,15 +2140,26 @@ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (i128 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (i128 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (f128 VR128:$src))), (v8f16 VR128:$src)>;
+
def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (i128 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>;
@@ -2151,6 +2168,7 @@ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (i128 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8f16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>;
@@ -2159,6 +2177,7 @@ def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (i128 VR128:$src))), (f128 VR128:$src)>;
+def : Pat<(f128 (bitconvert (v8f16 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>;
@@ -2166,6 +2185,7 @@ def : Pat<(i128 (bitconvert (v16i8 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v8i16 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v4i32 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v2i64 VR128:$src))), (i128 VR128:$src)>;
+def : Pat<(i128 (bitconvert (v8f16 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v4f32 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v2f64 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (f128 VR128:$src))), (i128 VR128:$src)>;
@@ -2216,6 +2236,7 @@ multiclass ScalarToVectorFP<Instruction vrep, ValueType vt, RegisterOperand cls,
(vrep (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar,
subreg), 0)>;
}
+defm : ScalarToVectorFP<VREPH, v8f16, FP16, subreg_h16>;
defm : ScalarToVectorFP<VREPF, v4f32, FP32, subreg_h32>;
defm : ScalarToVectorFP<VREPG, v2f64, FP64, subreg_h64>;
@@ -2236,6 +2257,11 @@ let AddedComplexity = 4 in {
// 3 added by TableGen for the base register operand in VLGV-based integer
// extractions and ensures that this version is strictly better.
let AddedComplexity = 4 in {
+ def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), 0)),
+ (EXTRACT_SUBREG VR128:$vec, subreg_h16)>;
+ def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), imm32zx3:$index)),
+ (EXTRACT_SUBREG (VREPH VR128:$vec, imm32zx3:$index), subreg_h16)>;
+
def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)),
(EXTRACT_SUBREG VR128:$vec, subreg_h32)>;
def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)),
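A small IR example of the two f16 extraction patterns above (function names
hypothetical): element 0 needs only a subreg copy, while a nonzero index is
expected to go through vreph first:

  define half @extract0(<8 x half> %v) { ; subreg copy only
    %e = extractelement <8 x half> %v, i32 0
    ret half %e
  }
  define half @extract5(<8 x half> %v) { ; vreph %v0, %v24, 5 + subreg copy
    %e = extractelement <8 x half> %v, i32 5
    ret half %e
  }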
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index e79f12b449a88..1ef8e81c8f829 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -305,13 +305,13 @@ defm VR64 : SystemZRegClass<"VR64", [f64, v8i8, v4i16, v2i32, v2f32], 64,
// The subset of vector registers that can be used for floating-point
// operations too.
defm VF128 : SystemZRegClass<"VF128",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
- (sequence "V%u", 0, 15)>;
+ [v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+ 128, (sequence "V%u", 0, 15)>;
// All vector registers.
defm VR128 : SystemZRegClass<"VR128",
[v16i8, v8i16, v4i32, v2i64, i128,
- v4f32, v2f64, f128],
+ v8f16, v4f32, v2f64, f128],
128, (add (sequence "V%u", 0, 7),
(sequence "V%u", 16, 31),
(sequence "V%u", 8, 15))>;
diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
index e02f931c4d31e..d0f3414e89497 100644
--- a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
+++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
@@ -111,87 +111,93 @@ define void @canonicalize_ptr_f128(ptr %out) {
define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind {
; Z16-LABEL: canonicalize_v8f16:
; Z16: # %bb.0:
-; Z16-NEXT: stmg %r13, %r15, 104(%r15)
+; Z16-NEXT: stmg %r14, %r15, 112(%r15)
; Z16-NEXT: aghi %r15, -224
-; Z16-NEXT: std %f8, 216(%r15) # 8-byte Spill
-; Z16-NEXT: std %f9, 208(%r15) # 8-byte Spill
-; Z16-NEXT: std %f10, 200(%r15) # 8-byte Spill
-; Z16-NEXT: std %f11, 192(%r15) # 8-byte Spill
-; Z16-NEXT: std %f12, 184(%r15) # 8-byte Spill
-; Z16-NEXT: std %f13, 176(%r15) # 8-byte Spill
-; Z16-NEXT: std %f14, 168(%r15) # 8-byte Spill
-; Z16-NEXT: std %f15, 160(%r15) # 8-byte Spill
-; Z16-NEXT: vlreph %v11, 414(%r15)
-; Z16-NEXT: vlreph %v12, 406(%r15)
-; Z16-NEXT: vlreph %v13, 398(%r15)
-; Z16-NEXT: vlreph %v14, 390(%r15)
-; Z16-NEXT: ldr %f8, %f6
-; Z16-NEXT: ldr %f9, %f4
-; Z16-NEXT: ldr %f10, %f2
-; Z16-NEXT: lgr %r13, %r2
+; Z16-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vreph %v0, %v24, 7
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f15, %f0
-; Z16-NEXT: ldr %f0, %f10
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 6
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f10, %f0
-; Z16-NEXT: ldr %f0, %f9
+; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vmrhh %v0, %v0, %v1
+; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 5
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f9, %f0
-; Z16-NEXT: ldr %f0, %f8
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 4
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f8, %f0
-; Z16-NEXT: ldr %f0, %f14
+; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vmrhh %v0, %v0, %v1
+; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vmrhf %v0, %v0, %v1
+; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 3
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f14, %f0
-; Z16-NEXT: ldr %f0, %f13
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 2
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f13, %f0
-; Z16-NEXT: ldr %f0, %f12
+; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vmrhh %v0, %v0, %v1
+; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f12, %f0
-; Z16-NEXT: ldr %f0, %f11
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 1
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: vsteh %v0, 14(%r13), 0
-; Z16-NEXT: vsteh %v12, 12(%r13), 0
-; Z16-NEXT: vsteh %v13, 10(%r13), 0
-; Z16-NEXT: vsteh %v14, 8(%r13), 0
-; Z16-NEXT: vsteh %v8, 6(%r13), 0
-; Z16-NEXT: vsteh %v9, 4(%r13), 0
-; Z16-NEXT: vsteh %v10, 2(%r13), 0
-; Z16-NEXT: vsteh %v15, 0(%r13), 0
-; Z16-NEXT: ld %f8, 216(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f9, 208(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f10, 200(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f11, 192(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f12, 184(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f13, 176(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f14, 168(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f15, 160(%r15) # 8-byte Reload
-; Z16-NEXT: lmg %r13, %r15, 328(%r15)
+; Z16-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vmrhh %v0, %v1, %v0
+; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vmrhf %v0, %v0, %v1
+; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vmrhg %v24, %v0, %v1
+; Z16-NEXT: lmg %r14, %r15, 336(%r15)
; Z16-NEXT: br %r14
%canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %a)
ret <8 x half> %canonicalized
@@ -253,85 +259,93 @@ define void @canonicalize_ptr_v8f16(ptr %out) nounwind {
; Z16: # %bb.0:
; Z16-NEXT: stmg %r13, %r15, 104(%r15)
; Z16-NEXT: aghi %r15, -224
-; Z16-NEXT: std %f8, 216(%r15) # 8-byte Spill
-; Z16-NEXT: std %f9, 208(%r15) # 8-byte Spill
-; Z16-NEXT: std %f10, 200(%r15) # 8-byte Spill
-; Z16-NEXT: std %f11, 192(%r15) # 8-byte Spill
-; Z16-NEXT: std %f12, 184(%r15) # 8-byte Spill
-; Z16-NEXT: std %f13, 176(%r15) # 8-byte Spill
-; Z16-NEXT: std %f14, 168(%r15) # 8-byte Spill
-; Z16-NEXT: std %f15, 160(%r15) # 8-byte Spill
-; Z16-NEXT: vlreph %v0, 0(%r2)
-; Z16-NEXT: vlreph %v8, 14(%r2)
-; Z16-NEXT: vlreph %v9, 12(%r2)
-; Z16-NEXT: vlreph %v10, 10(%r2)
+; Z16-NEXT: vl %v0, 0(%r2), 3
; Z16-NEXT: lgr %r13, %r2
-; Z16-NEXT: vlreph %v11, 8(%r2)
-; Z16-NEXT: vlreph %v12, 6(%r2)
-; Z16-NEXT: vlreph %v13, 4(%r2)
-; Z16-NEXT: vlreph %v14, 2(%r2)
+; Z16-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vreph %v0, %v0, 7
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f15, %f0
-; Z16-NEXT: ldr %f0, %f14
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 6
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f14, %f0
-; Z16-NEXT: ldr %f0, %f13
+; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vmrhh %v0, %v0, %v1
+; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 5
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f13, %f0
-; Z16-NEXT: ldr %f0, %f12
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 4
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f12, %f0
-; Z16-NEXT: ldr %f0, %f11
+; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vmrhh %v0, %v0, %v1
+; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vmrhf %v0, %v0, %v1
+; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 3
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f11, %f0
-; Z16-NEXT: ldr %f0, %f10
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 2
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f10, %f0
-; Z16-NEXT: ldr %f0, %f9
+; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vmrhh %v0, %v0, %v1
+; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: ldr %f9, %f0
-; Z16-NEXT: ldr %f0, %f8
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vreph %v0, %v0, 1
+; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0
; Z16-NEXT: brasl %r14, __extendhfsf2 at PLT
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: brasl %r14, __truncsfhf2 at PLT
-; Z16-NEXT: vsteh %v9, 12(%r13), 0
-; Z16-NEXT: vsteh %v10, 10(%r13), 0
-; Z16-NEXT: vsteh %v11, 8(%r13), 0
-; Z16-NEXT: vsteh %v12, 6(%r13), 0
-; Z16-NEXT: vsteh %v13, 4(%r13), 0
-; Z16-NEXT: vsteh %v14, 2(%r13), 0
-; Z16-NEXT: vsteh %v15, 0(%r13), 0
-; Z16-NEXT: ld %f8, 216(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f9, 208(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f10, 200(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f11, 192(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f12, 184(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f13, 176(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f14, 168(%r15) # 8-byte Reload
-; Z16-NEXT: ld %f15, 160(%r15) # 8-byte Reload
-; Z16-NEXT: vsteh %v0, 14(%r13), 0
+; Z16-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; Z16-NEXT: # kill: def $f0h killed $f0h def $v0
+; Z16-NEXT: vmrhh %v0, %v1, %v0
+; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vmrhf %v0, %v0, %v1
+; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; Z16-NEXT: vmrhg %v0, %v0, %v1
+; Z16-NEXT: vst %v0, 0(%r13), 3
; Z16-NEXT: lmg %r13, %r15, 328(%r15)
; Z16-NEXT: br %r14
%val = load <8 x half>, ptr %out
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
new file mode 100644
index 0000000000000..aee9161bd29ae
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
@@ -0,0 +1,639 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=SCALAR
+
+; Function argument in vector register.
+declare void @foo0(<8 x half>)
+define void @fun0(<8 x half> %A, ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vst %v24, 0(%r3), 3
+; VECTOR-NEXT: vl %v24, 0(%r2), 3
+; VECTOR-NEXT: brasl %r14, foo0 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun0:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -192
+; SCALAR-NEXT: .cfi_def_cfa_offset 352
+; SCALAR-NEXT: lgh %r0, 382(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f1, %r0
+; SCALAR-NEXT: lgh %r0, 374(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f3, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
+; SCALAR-NEXT: # kill: def $f2h killed $f2h def $f2d
+; SCALAR-NEXT: # kill: def $f4h killed $f4h def $f4d
+; SCALAR-NEXT: # kill: def $f6h killed $f6h def $f6d
+; SCALAR-NEXT: lgh %r0, 366(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f5, %r0
+; SCALAR-NEXT: lgh %r0, 358(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f7, %r0
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 0(%r3)
+; SCALAR-NEXT: lgdr %r0, %f7
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 8(%r3)
+; SCALAR-NEXT: lgdr %r0, %f2
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 2(%r3)
+; SCALAR-NEXT: lgdr %r0, %f5
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 10(%r3)
+; SCALAR-NEXT: lgdr %r0, %f4
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 4(%r3)
+; SCALAR-NEXT: lgdr %r0, %f3
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 12(%r3)
+; SCALAR-NEXT: lgdr %r0, %f6
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r3)
+; SCALAR-NEXT: lgdr %r0, %f1
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 14(%r3)
+; SCALAR-NEXT: lgh %r0, 0(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: lgh %r0, 2(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f2, %r0
+; SCALAR-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; SCALAR-NEXT: lgh %r0, 4(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f4, %r0
+; SCALAR-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; SCALAR-NEXT: lgh %r0, 6(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f6, %r0
+; SCALAR-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; SCALAR-NEXT: lgh %r0, 8(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f1, %r0
+; SCALAR-NEXT: lgh %r0, 10(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f3, %r0
+; SCALAR-NEXT: lgh %r0, 12(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f5, %r0
+; SCALAR-NEXT: lgh %r0, 14(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f7, %r0
+; SCALAR-NEXT: lgdr %r0, %f7
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 190(%r15)
+; SCALAR-NEXT: lgdr %r0, %f5
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 182(%r15)
+; SCALAR-NEXT: lgdr %r0, %f3
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 174(%r15)
+; SCALAR-NEXT: lgdr %r0, %f1
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 166(%r15)
+; SCALAR-NEXT: brasl %r14, foo0 at PLT
+; SCALAR-NEXT: lmg %r14, %r15, 304(%r15)
+; SCALAR-NEXT: br %r14
+ store <8 x half> %A, ptr %Dst
+ %L = load <8 x half>, ptr %Src
+ call void @foo0(<8 x half> %L)
+ ret void
+}
+
+declare void @foo1(<4 x half>)
+define void @fun1(<4 x half> %A, ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vsteg %v24, 0(%r3), 0
+; VECTOR-NEXT: vlrepg %v24, 0(%r2)
+; VECTOR-NEXT: brasl %r14, foo0 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun1:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -160
+; SCALAR-NEXT: .cfi_def_cfa_offset 320
+; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: # kill: def $f2h killed $f2h def $f2d
+; SCALAR-NEXT: sth %r0, 0(%r3)
+; SCALAR-NEXT: lgdr %r0, %f2
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: # kill: def $f4h killed $f4h def $f4d
+; SCALAR-NEXT: sth %r0, 2(%r3)
+; SCALAR-NEXT: # kill: def $f6h killed $f6h def $f6d
+; SCALAR-NEXT: lgdr %r0, %f4
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 4(%r3)
+; SCALAR-NEXT: lgdr %r0, %f6
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r3)
+; SCALAR-NEXT: lgh %r0, 0(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: lgh %r0, 2(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f2, %r0
+; SCALAR-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; SCALAR-NEXT: lgh %r0, 4(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f4, %r0
+; SCALAR-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; SCALAR-NEXT: lgh %r0, 6(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f6, %r0
+; SCALAR-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; SCALAR-NEXT: brasl %r14, foo0 at PLT
+; SCALAR-NEXT: lmg %r14, %r15, 272(%r15)
+; SCALAR-NEXT: br %r14
+ store <4 x half> %A, ptr %Dst
+ %L = load <4 x half>, ptr %Src
+ call void @foo0(<4 x half> %L)
+ ret void
+}
+
+declare void @foo2(<16 x half>)
+define void @fun2(<16 x half> %A, ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vst %v24, 0(%r3), 4
+; VECTOR-NEXT: vst %v26, 16(%r3), 4
+; VECTOR-NEXT: vl %v24, 0(%r2), 4
+; VECTOR-NEXT: vl %v26, 16(%r2), 4
+; VECTOR-NEXT: brasl %r14, foo0 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun2:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -320
+; SCALAR-NEXT: .cfi_def_cfa_offset 480
+; SCALAR-NEXT: std %f8, 312(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f9, 304(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f10, 296(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f11, 288(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f12, 280(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f13, 272(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f14, 264(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f15, 256(%r15) # 8-byte Spill
+; SCALAR-NEXT: .cfi_offset %f8, -168
+; SCALAR-NEXT: .cfi_offset %f9, -176
+; SCALAR-NEXT: .cfi_offset %f10, -184
+; SCALAR-NEXT: .cfi_offset %f11, -192
+; SCALAR-NEXT: .cfi_offset %f12, -200
+; SCALAR-NEXT: .cfi_offset %f13, -208
+; SCALAR-NEXT: .cfi_offset %f14, -216
+; SCALAR-NEXT: .cfi_offset %f15, -224
+; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
+; SCALAR-NEXT: # kill: def $f2h killed $f2h def $f2d
+; SCALAR-NEXT: # kill: def $f4h killed $f4h def $f4d
+; SCALAR-NEXT: # kill: def $f6h killed $f6h def $f6d
+; SCALAR-NEXT: lgh %r0, 574(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f1, %r0
+; SCALAR-NEXT: lgh %r0, 566(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f3, %r0
+; SCALAR-NEXT: lgh %r0, 558(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f5, %r0
+; SCALAR-NEXT: lgh %r0, 550(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f7, %r0
+; SCALAR-NEXT: lgh %r0, 542(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f8, %r0
+; SCALAR-NEXT: lgh %r0, 534(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f9, %r0
+; SCALAR-NEXT: lgh %r0, 526(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f10, %r0
+; SCALAR-NEXT: lgh %r0, 518(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f11, %r0
+; SCALAR-NEXT: lgh %r0, 510(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f12, %r0
+; SCALAR-NEXT: lgh %r0, 502(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f13, %r0
+; SCALAR-NEXT: lgh %r0, 494(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f14, %r0
+; SCALAR-NEXT: lgh %r0, 486(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f15, %r0
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 0(%r3)
+; SCALAR-NEXT: lgdr %r0, %f2
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 2(%r3)
+; SCALAR-NEXT: lgdr %r0, %f4
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 4(%r3)
+; SCALAR-NEXT: lgdr %r0, %f6
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r3)
+; SCALAR-NEXT: lgdr %r0, %f15
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 8(%r3)
+; SCALAR-NEXT: lgdr %r0, %f14
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 10(%r3)
+; SCALAR-NEXT: lgdr %r0, %f13
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 12(%r3)
+; SCALAR-NEXT: lgdr %r0, %f12
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 14(%r3)
+; SCALAR-NEXT: lgdr %r0, %f11
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 16(%r3)
+; SCALAR-NEXT: lgdr %r0, %f10
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 18(%r3)
+; SCALAR-NEXT: lgdr %r0, %f9
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 20(%r3)
+; SCALAR-NEXT: lgdr %r0, %f8
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 22(%r3)
+; SCALAR-NEXT: lgdr %r0, %f7
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 24(%r3)
+; SCALAR-NEXT: lgdr %r0, %f5
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 26(%r3)
+; SCALAR-NEXT: lgdr %r0, %f3
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 28(%r3)
+; SCALAR-NEXT: lgdr %r0, %f1
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 30(%r3)
+; SCALAR-NEXT: lgh %r0, 0(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: lgh %r0, 2(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f2, %r0
+; SCALAR-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; SCALAR-NEXT: lgh %r0, 4(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f4, %r0
+; SCALAR-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; SCALAR-NEXT: lgh %r0, 6(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f6, %r0
+; SCALAR-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; SCALAR-NEXT: lgh %r0, 8(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f1, %r0
+; SCALAR-NEXT: lgh %r0, 10(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f3, %r0
+; SCALAR-NEXT: lgh %r0, 12(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f5, %r0
+; SCALAR-NEXT: lgh %r0, 14(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f7, %r0
+; SCALAR-NEXT: lgh %r0, 16(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f8, %r0
+; SCALAR-NEXT: lgh %r0, 18(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f9, %r0
+; SCALAR-NEXT: lgh %r0, 20(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f10, %r0
+; SCALAR-NEXT: lgh %r0, 22(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f11, %r0
+; SCALAR-NEXT: lgh %r0, 24(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f12, %r0
+; SCALAR-NEXT: lgh %r0, 26(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f13, %r0
+; SCALAR-NEXT: lgh %r0, 28(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f14, %r0
+; SCALAR-NEXT: lgh %r0, 30(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f15, %r0
+; SCALAR-NEXT: lgdr %r0, %f15
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 254(%r15)
+; SCALAR-NEXT: lgdr %r0, %f14
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 246(%r15)
+; SCALAR-NEXT: lgdr %r0, %f13
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 238(%r15)
+; SCALAR-NEXT: lgdr %r0, %f12
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 230(%r15)
+; SCALAR-NEXT: lgdr %r0, %f11
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 222(%r15)
+; SCALAR-NEXT: lgdr %r0, %f10
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 214(%r15)
+; SCALAR-NEXT: lgdr %r0, %f9
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 206(%r15)
+; SCALAR-NEXT: lgdr %r0, %f8
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 198(%r15)
+; SCALAR-NEXT: lgdr %r0, %f7
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 190(%r15)
+; SCALAR-NEXT: lgdr %r0, %f5
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 182(%r15)
+; SCALAR-NEXT: lgdr %r0, %f3
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 174(%r15)
+; SCALAR-NEXT: lgdr %r0, %f1
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 166(%r15)
+; SCALAR-NEXT: brasl %r14, foo0 at PLT
+; SCALAR-NEXT: ld %f8, 312(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f9, 304(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f10, 296(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f11, 288(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f12, 280(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f13, 272(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f14, 264(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f15, 256(%r15) # 8-byte Reload
+; SCALAR-NEXT: lmg %r14, %r15, 432(%r15)
+; SCALAR-NEXT: br %r14
+ store <16 x half> %A, ptr %Dst
+ %L = load <16 x half>, ptr %Src
+ call void @foo0(<16 x half> %L)
+ ret void
+}
+
+; Return in vector register.
+declare <8 x half> @foo3()
+define <8 x half> @fun3(ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r12, %r15, 96(%r15)
+; VECTOR-NEXT: .cfi_offset %r12, -64
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: lgr %r12, %r2
+; VECTOR-NEXT: brasl %r14, foo3 at PLT
+; VECTOR-NEXT: vst %v24, 0(%r13), 3
+; VECTOR-NEXT: vl %v24, 0(%r12), 3
+; VECTOR-NEXT: lmg %r12, %r15, 256(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun3:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r11, %r15, 88(%r15)
+; SCALAR-NEXT: .cfi_offset %r11, -72
+; SCALAR-NEXT: .cfi_offset %r12, -64
+; SCALAR-NEXT: .cfi_offset %r13, -56
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -176
+; SCALAR-NEXT: .cfi_def_cfa_offset 336
+; SCALAR-NEXT: lgr %r13, %r2
+; SCALAR-NEXT: la %r2, 160(%r15)
+; SCALAR-NEXT: lgr %r11, %r4
+; SCALAR-NEXT: lgr %r12, %r3
+; SCALAR-NEXT: brasl %r14, foo3 at PLT
+; SCALAR-NEXT: lgh %r0, 160(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: lgh %r0, 162(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f1, %r0
+; SCALAR-NEXT: lgh %r0, 164(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f2, %r0
+; SCALAR-NEXT: lgh %r0, 166(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f3, %r0
+; SCALAR-NEXT: lgh %r0, 168(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f4, %r0
+; SCALAR-NEXT: lgh %r0, 170(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f5, %r0
+; SCALAR-NEXT: lgh %r0, 172(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f6, %r0
+; SCALAR-NEXT: lgh %r0, 174(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f7, %r0
+; SCALAR-NEXT: lgdr %r0, %f7
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 14(%r11)
+; SCALAR-NEXT: lgdr %r0, %f6
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 12(%r11)
+; SCALAR-NEXT: lgdr %r0, %f5
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 10(%r11)
+; SCALAR-NEXT: lgdr %r0, %f4
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 8(%r11)
+; SCALAR-NEXT: lgdr %r0, %f3
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r11)
+; SCALAR-NEXT: lgdr %r0, %f2
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 4(%r11)
+; SCALAR-NEXT: lgdr %r0, %f1
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 2(%r11)
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 0(%r11)
+; SCALAR-NEXT: lg %r0, 0(%r12)
+; SCALAR-NEXT: lg %r1, 8(%r12)
+; SCALAR-NEXT: stg %r1, 8(%r13)
+; SCALAR-NEXT: stg %r0, 0(%r13)
+; SCALAR-NEXT: lmg %r11, %r15, 264(%r15)
+; SCALAR-NEXT: br %r14
+ %V = call <8 x half> @foo3()
+ store <8 x half> %V, ptr %Dst
+ %L = load <8 x half>, ptr %Src
+ ret <8 x half> %L
+}
+
+declare <4 x half> @foo4()
+define <4 x half> @fun4(ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun4:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r12, %r15, 96(%r15)
+; VECTOR-NEXT: .cfi_offset %r12, -64
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: lgr %r12, %r2
+; VECTOR-NEXT: brasl %r14, foo4 at PLT
+; VECTOR-NEXT: vsteg %v24, 0(%r13), 0
+; VECTOR-NEXT: vlrepg %v24, 0(%r12)
+; VECTOR-NEXT: lmg %r12, %r15, 256(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun4:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r12, %r15, 96(%r15)
+; SCALAR-NEXT: .cfi_offset %r12, -64
+; SCALAR-NEXT: .cfi_offset %r13, -56
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -160
+; SCALAR-NEXT: .cfi_def_cfa_offset 320
+; SCALAR-NEXT: lgr %r12, %r3
+; SCALAR-NEXT: lgr %r13, %r2
+; SCALAR-NEXT: brasl %r14, foo4 at PLT
+; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: # kill: def $f2h killed $f2h def $f2d
+; SCALAR-NEXT: sth %r0, 0(%r12)
+; SCALAR-NEXT: lgdr %r0, %f2
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: # kill: def $f4h killed $f4h def $f4d
+; SCALAR-NEXT: sth %r0, 2(%r12)
+; SCALAR-NEXT: lgdr %r0, %f4
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: # kill: def $f6h killed $f6h def $f6d
+; SCALAR-NEXT: sth %r0, 4(%r12)
+; SCALAR-NEXT: lgdr %r0, %f6
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r12)
+; SCALAR-NEXT: lgh %r0, 0(%r13)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: lgh %r0, 2(%r13)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f2, %r0
+; SCALAR-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; SCALAR-NEXT: lgh %r0, 4(%r13)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f4, %r0
+; SCALAR-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; SCALAR-NEXT: lgh %r0, 6(%r13)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f6, %r0
+; SCALAR-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; SCALAR-NEXT: lmg %r12, %r15, 256(%r15)
+; SCALAR-NEXT: br %r14
+ %V = call <4 x half> @foo4()
+ store <4 x half> %V, ptr %Dst
+ %L = load <4 x half>, ptr %Src
+ ret <4 x half> %L
+}
+
+declare <16 x half> @foo5()
+define <16 x half> @fun5(ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun5:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r12, %r15, 96(%r15)
+; VECTOR-NEXT: .cfi_offset %r12, -64
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: lgr %r12, %r2
+; VECTOR-NEXT: brasl %r14, foo5 at PLT
+; VECTOR-NEXT: vst %v24, 0(%r13), 4
+; VECTOR-NEXT: vst %v26, 16(%r13), 4
+; VECTOR-NEXT: vl %v24, 0(%r12), 4
+; VECTOR-NEXT: vl %v26, 16(%r12), 4
+; VECTOR-NEXT: lmg %r12, %r15, 256(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun5:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r11, %r15, 88(%r15)
+; SCALAR-NEXT: .cfi_offset %r11, -72
+; SCALAR-NEXT: .cfi_offset %r12, -64
+; SCALAR-NEXT: .cfi_offset %r13, -56
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -192
+; SCALAR-NEXT: .cfi_def_cfa_offset 352
+; SCALAR-NEXT: lgr %r11, %r2
+; SCALAR-NEXT: la %r2, 160(%r15)
+; SCALAR-NEXT: lgr %r13, %r4
+; SCALAR-NEXT: lgr %r12, %r3
+; SCALAR-NEXT: brasl %r14, foo5 at PLT
+; SCALAR-NEXT: lg %r0, 160(%r15)
+; SCALAR-NEXT: lg %r1, 168(%r15)
+; SCALAR-NEXT: lg %r2, 176(%r15)
+; SCALAR-NEXT: lg %r3, 184(%r15)
+; SCALAR-NEXT: stg %r3, 24(%r13)
+; SCALAR-NEXT: stg %r2, 16(%r13)
+; SCALAR-NEXT: stg %r1, 8(%r13)
+; SCALAR-NEXT: stg %r0, 0(%r13)
+; SCALAR-NEXT: lg %r0, 24(%r12)
+; SCALAR-NEXT: lg %r1, 16(%r12)
+; SCALAR-NEXT: lg %r2, 8(%r12)
+; SCALAR-NEXT: lg %r3, 0(%r12)
+; SCALAR-NEXT: stg %r3, 0(%r11)
+; SCALAR-NEXT: stg %r2, 8(%r11)
+; SCALAR-NEXT: stg %r1, 16(%r11)
+; SCALAR-NEXT: stg %r0, 24(%r11)
+; SCALAR-NEXT: lmg %r11, %r15, 280(%r15)
+; SCALAR-NEXT: br %r14
+ %V = call <16 x half> @foo5()
+ store <16 x half> %V, ptr %Dst
+ %L = load <16 x half>, ptr %Src
+ ret <16 x half> %L
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll
new file mode 100644
index 0000000000000..ad0a5cac5cc08
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll
@@ -0,0 +1,888 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=SCALAR
+
+; Scalarized operations, full vector.
+define <8 x half> @fun0(<8 x half> %LHS, <8 x half> %RHS) {
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -248
+; VECTOR-NEXT: .cfi_def_cfa_offset 408
+; VECTOR-NEXT: std %f8, 240(%r15) # 8-byte Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: vst %v26, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v26, 7
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 7
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 6
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 6
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 5
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 5
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 4
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 4
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 224(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 224(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v1, %v0
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: ld %f8, 240(%r15) # 8-byte Reload
+; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: lmg %r14, %r15, 360(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun0:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r13, %r15, 104(%r15)
+; SCALAR-NEXT: .cfi_offset %r13, -56
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -288
+; SCALAR-NEXT: .cfi_def_cfa_offset 448
+; SCALAR-NEXT: std %f8, 280(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f9, 272(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f10, 264(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f11, 256(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f12, 248(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f13, 240(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f14, 232(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f15, 224(%r15) # 8-byte Spill
+; SCALAR-NEXT: .cfi_offset %f8, -168
+; SCALAR-NEXT: .cfi_offset %f9, -176
+; SCALAR-NEXT: .cfi_offset %f10, -184
+; SCALAR-NEXT: .cfi_offset %f11, -192
+; SCALAR-NEXT: .cfi_offset %f12, -200
+; SCALAR-NEXT: .cfi_offset %f13, -208
+; SCALAR-NEXT: .cfi_offset %f14, -216
+; SCALAR-NEXT: .cfi_offset %f15, -224
+; SCALAR-NEXT: lgh %r0, 478(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: stg %r0, 216(%r15) # 8-byte Spill
+; SCALAR-NEXT: lgh %r0, 542(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: stg %r0, 208(%r15) # 8-byte Spill
+; SCALAR-NEXT: lgh %r0, 470(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: stg %r0, 192(%r15) # 8-byte Spill
+; SCALAR-NEXT: lgh %r0, 534(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: stg %r0, 184(%r15) # 8-byte Spill
+; SCALAR-NEXT: lgh %r0, 462(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: stg %r0, 176(%r15) # 8-byte Spill
+; SCALAR-NEXT: lgh %r0, 526(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: stg %r0, 168(%r15) # 8-byte Spill
+; SCALAR-NEXT: lgh %r0, 454(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f10, %r0
+; SCALAR-NEXT: lgh %r0, 518(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f11, %r0
+; SCALAR-NEXT: lgh %r0, 510(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f14, %r0
+; SCALAR-NEXT: lgh %r0, 502(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f15, %r0
+; SCALAR-NEXT: lgh %r0, 494(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f8, %r0
+; SCALAR-NEXT: lgh %r0, 486(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ste %f6, 164(%r15) # 4-byte Spill
+; SCALAR-NEXT: ste %f4, 160(%r15) # 4-byte Spill
+; SCALAR-NEXT: ler %f13, %f2
+; SCALAR-NEXT: ler %f12, %f0
+; SCALAR-NEXT: lgr %r13, %r2
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f9, %f0
+; SCALAR-NEXT: ler %f0, %f12
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: aebr %f0, %f9
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
+; SCALAR-NEXT: std %f0, 200(%r15) # 8-byte Spill
+; SCALAR-NEXT: ler %f0, %f8
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f8, %f0
+; SCALAR-NEXT: ler %f0, %f13
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: aebr %f0, %f8
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f13, %f0
+; SCALAR-NEXT: ler %f0, %f15
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f8, %f0
+; SCALAR-NEXT: le %f0, 160(%r15) # 4-byte Reload
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: aebr %f0, %f8
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f9, %f0
+; SCALAR-NEXT: ler %f0, %f14
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f8, %f0
+; SCALAR-NEXT: le %f0, 164(%r15) # 4-byte Reload
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: aebr %f0, %f8
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f14, %f0
+; SCALAR-NEXT: ler %f0, %f11
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f8, %f0
+; SCALAR-NEXT: ler %f0, %f10
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: aebr %f0, %f8
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f10, %f0
+; SCALAR-NEXT: ld %f0, 168(%r15) # 8-byte Reload
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f8, %f0
+; SCALAR-NEXT: ld %f0, 176(%r15) # 8-byte Reload
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: aebr %f0, %f8
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f8, %f0
+; SCALAR-NEXT: ld %f0, 184(%r15) # 8-byte Reload
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f11, %f0
+; SCALAR-NEXT: ld %f0, 192(%r15) # 8-byte Reload
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: aebr %f0, %f11
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f11, %f0
+; SCALAR-NEXT: ld %f0, 208(%r15) # 8-byte Reload
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f12, %f0
+; SCALAR-NEXT: ld %f0, 216(%r15) # 8-byte Reload
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: aebr %f0, %f12
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 14(%r13)
+; SCALAR-NEXT: lgdr %r0, %f11
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 12(%r13)
+; SCALAR-NEXT: lgdr %r0, %f8
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 10(%r13)
+; SCALAR-NEXT: lgdr %r0, %f10
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 8(%r13)
+; SCALAR-NEXT: lgdr %r0, %f14
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r13)
+; SCALAR-NEXT: lgdr %r0, %f9
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 4(%r13)
+; SCALAR-NEXT: lgdr %r0, %f13
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 2(%r13)
+; SCALAR-NEXT: lg %r0, 200(%r15) # 8-byte Reload
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 0(%r13)
+; SCALAR-NEXT: ld %f8, 280(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f9, 272(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f10, 264(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f11, 256(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f12, 248(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f13, 240(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f14, 232(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f15, 224(%r15) # 8-byte Reload
+; SCALAR-NEXT: lmg %r13, %r15, 392(%r15)
+; SCALAR-NEXT: br %r14
+ %Res = fadd <8 x half> %LHS, %RHS
+ ret <8 x half> %Res
+}
+
+; Scalarized operations on a partial vector. TODO: The v4f16 is first widened
+; and then scalarized, which unfortunately results in 8 scalar operations
+; (see the sketch after this test). Maybe the DAGCombiner could be taught to
+; handle EXTRACT_SUBVECTOR in cases like this, where the operands start out
+; as full vectors.
+define <4 x half> @fun1(<4 x half> %LHS, <4 x half> %RHS) {
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -248
+; VECTOR-NEXT: .cfi_def_cfa_offset 408
+; VECTOR-NEXT: std %f8, 240(%r15) # 8-byte Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: vst %v26, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v26, 7
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 7
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 6
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 6
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 5
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 5
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 4
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 4
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 224(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 224(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v1, %v0
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: ld %f8, 240(%r15) # 8-byte Reload
+; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: lmg %r14, %r15, 360(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun1:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -224
+; SCALAR-NEXT: .cfi_def_cfa_offset 384
+; SCALAR-NEXT: std %f8, 216(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f9, 208(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f10, 200(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f11, 192(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f12, 184(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f13, 176(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f14, 168(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f15, 160(%r15) # 8-byte Spill
+; SCALAR-NEXT: .cfi_offset %f8, -168
+; SCALAR-NEXT: .cfi_offset %f9, -176
+; SCALAR-NEXT: .cfi_offset %f10, -184
+; SCALAR-NEXT: .cfi_offset %f11, -192
+; SCALAR-NEXT: .cfi_offset %f12, -200
+; SCALAR-NEXT: .cfi_offset %f13, -208
+; SCALAR-NEXT: .cfi_offset %f14, -216
+; SCALAR-NEXT: .cfi_offset %f15, -224
+; SCALAR-NEXT: lgh %r0, 414(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f9, %r0
+; SCALAR-NEXT: lgh %r0, 406(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f13, %r0
+; SCALAR-NEXT: lgh %r0, 398(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f14, %r0
+; SCALAR-NEXT: lgh %r0, 390(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ler %f8, %f6
+; SCALAR-NEXT: ler %f10, %f4
+; SCALAR-NEXT: ler %f12, %f2
+; SCALAR-NEXT: ler %f11, %f0
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f15, %f0
+; SCALAR-NEXT: ler %f0, %f11
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f15
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f11, %f0
+; SCALAR-NEXT: ler %f0, %f14
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f14, %f0
+; SCALAR-NEXT: ler %f0, %f12
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f14
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f12, %f0
+; SCALAR-NEXT: ler %f0, %f13
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f13, %f0
+; SCALAR-NEXT: ler %f0, %f10
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f13
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f10, %f0
+; SCALAR-NEXT: ler %f0, %f9
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f9, %f0
+; SCALAR-NEXT: ler %f0, %f8
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f9
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f6, %f0
+; SCALAR-NEXT: ler %f0, %f11
+; SCALAR-NEXT: ler %f2, %f12
+; SCALAR-NEXT: ler %f4, %f10
+; SCALAR-NEXT: ld %f8, 216(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f9, 208(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f10, 200(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f11, 192(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f12, 184(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f13, 176(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f14, 168(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f15, 160(%r15) # 8-byte Reload
+; SCALAR-NEXT: lmg %r14, %r15, 336(%r15)
+; SCALAR-NEXT: br %r14
+ %Res = fsub <4 x half> %LHS, %RHS
+ ret <4 x half> %Res
+}
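A minimal LLVM IR sketch of the widening described above (illustrative only,
not the legalizer's literal output; the %LHS.wide/%RHS.wide names are made up
here): the <4 x half> operands are first widened to the legal width with
undef upper lanes, and the fsub is then scalarized into all eight lanes:

  %LHS.wide = shufflevector <4 x half> %LHS, <4 x half> poison,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  %RHS.wide = shufflevector <4 x half> %RHS, <4 x half> poison,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  ; Scalarized into 8 extend/sub/trunc libcall sequences, even though
  ; lanes 4-7 are undef:
  %Res.wide = fsub <8 x half> %LHS.wide, %RHS.wide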
+
+; Same as above, but the resulting v4f16 is stored instead, so
+; SimplifyDemandedVectorElts() can remove the unneeded scalar operations
+; (via the SCALAR_TO_VECTOR handling in combineExtract(); sketch after this
+; test).
+define void @fun2(<4 x half> %LHS, <4 x half> %RHS, ptr %Dst) {
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -232
+; VECTOR-NEXT: .cfi_def_cfa_offset 392
+; VECTOR-NEXT: std %f8, 224(%r15) # 8-byte Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: lgr %r13, %r2
+; VECTOR-NEXT: vst %v26, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v26, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v1, %v0
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vmrhf %v1, %v0, %v0
+; VECTOR-NEXT: ld %f8, 224(%r15) # 8-byte Reload
+; VECTOR-NEXT: vmrhg %v0, %v0, %v1
+; VECTOR-NEXT: vsteg %v0, 0(%r13), 0
+; VECTOR-NEXT: lmg %r13, %r15, 336(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun2:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r13, %r15, 104(%r15)
+; SCALAR-NEXT: .cfi_offset %r13, -56
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -224
+; SCALAR-NEXT: .cfi_def_cfa_offset 384
+; SCALAR-NEXT: std %f8, 216(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f9, 208(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f10, 200(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f11, 192(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f12, 184(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f13, 176(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f14, 168(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f15, 160(%r15) # 8-byte Spill
+; SCALAR-NEXT: .cfi_offset %f8, -168
+; SCALAR-NEXT: .cfi_offset %f9, -176
+; SCALAR-NEXT: .cfi_offset %f10, -184
+; SCALAR-NEXT: .cfi_offset %f11, -192
+; SCALAR-NEXT: .cfi_offset %f12, -200
+; SCALAR-NEXT: .cfi_offset %f13, -208
+; SCALAR-NEXT: .cfi_offset %f14, -216
+; SCALAR-NEXT: .cfi_offset %f15, -224
+; SCALAR-NEXT: lgh %r0, 414(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f10, %r0
+; SCALAR-NEXT: lgh %r0, 406(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f13, %r0
+; SCALAR-NEXT: lgh %r0, 398(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f14, %r0
+; SCALAR-NEXT: lgh %r0, 390(%r15)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: lgr %r13, %r2
+; SCALAR-NEXT: ler %f8, %f6
+; SCALAR-NEXT: ler %f11, %f4
+; SCALAR-NEXT: ler %f12, %f2
+; SCALAR-NEXT: ler %f9, %f0
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f15, %f0
+; SCALAR-NEXT: ler %f0, %f9
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f15
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f9, %f0
+; SCALAR-NEXT: ler %f0, %f14
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f14, %f0
+; SCALAR-NEXT: ler %f0, %f12
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f14
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f12, %f0
+; SCALAR-NEXT: ler %f0, %f13
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f13, %f0
+; SCALAR-NEXT: ler %f0, %f11
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f13
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f11, %f0
+; SCALAR-NEXT: ler %f0, %f10
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f10, %f0
+; SCALAR-NEXT: ler %f0, %f8
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f10
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r13)
+; SCALAR-NEXT: lgdr %r0, %f11
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 4(%r13)
+; SCALAR-NEXT: lgdr %r0, %f12
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 2(%r13)
+; SCALAR-NEXT: lgdr %r0, %f9
+; SCALAR-NEXT: ld %f8, 216(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f9, 208(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f10, 200(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f11, 192(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f12, 184(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f13, 176(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f14, 168(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f15, 160(%r15) # 8-byte Reload
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 0(%r13)
+; SCALAR-NEXT: lmg %r13, %r15, 328(%r15)
+; SCALAR-NEXT: br %r14
+ %Res = fsub <4 x half> %LHS, %RHS
+ store <4 x half> %Res, ptr %Dst
+ ret void
+}
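A sketch of why the stored variant is cheaper, continuing the widened form
from the previous sketch (%LHS.wide and %RHS.wide are the illustrative widened
operands, not names from the actual output): the 8-byte store only demands
lanes 0-3 of the widened result, so the scalar operations feeding lanes 4-7
are dead:

  %Res.wide = fsub <8 x half> %LHS.wide, %RHS.wide
  ; Only lanes 0-3 are demanded by the <4 x half> store; the extract
  ; lets SimplifyDemandedVectorElts() delete the other four lanes' work.
  %Res = shufflevector <8 x half> %Res.wide, <8 x half> poison,
      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x half> %Res, ptr %Dst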
+
+; The handling in combineExtract() works, but due to the order in which
+; DAGCombiner revisits nodes and users, the fsubs are folded to NaNs instead
+; of undefs (see the comment in foldConstantFPMath()). Hence the vrepih
+; below; the splatted constant is decoded in the note after this test.
+define <4 x half> @fun3(ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -232
+; VECTOR-NEXT: .cfi_def_cfa_offset 392
+; VECTOR-NEXT: std %f8, 224(%r15) # 8-byte Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: vlrepg %v0, 0(%r2)
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vlrepg %v0, 8(%r2)
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v1, %v0
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vrepih %v1, 32256
+; VECTOR-NEXT: vmrhh %v1, %v1, %v1
+; VECTOR-NEXT: ld %f8, 224(%r15) # 8-byte Reload
+; VECTOR-NEXT: vmrhf %v1, %v1, %v1
+; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: lmg %r14, %r15, 344(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun3:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -224
+; SCALAR-NEXT: .cfi_def_cfa_offset 384
+; SCALAR-NEXT: std %f8, 216(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f9, 208(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f10, 200(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f11, 192(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f12, 184(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f13, 176(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f14, 168(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f15, 160(%r15) # 8-byte Spill
+; SCALAR-NEXT: .cfi_offset %f8, -168
+; SCALAR-NEXT: .cfi_offset %f9, -176
+; SCALAR-NEXT: .cfi_offset %f10, -184
+; SCALAR-NEXT: .cfi_offset %f11, -192
+; SCALAR-NEXT: .cfi_offset %f12, -200
+; SCALAR-NEXT: .cfi_offset %f13, -208
+; SCALAR-NEXT: .cfi_offset %f14, -216
+; SCALAR-NEXT: .cfi_offset %f15, -224
+; SCALAR-NEXT: lgh %r0, 6(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f8, %r0
+; SCALAR-NEXT: lgh %r0, 4(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f9, %r0
+; SCALAR-NEXT: lgh %r0, 2(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f11, %r0
+; SCALAR-NEXT: lgh %r0, 0(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f12, %r0
+; SCALAR-NEXT: lgh %r0, 14(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f10, %r0
+; SCALAR-NEXT: lgh %r0, 12(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f13, %r0
+; SCALAR-NEXT: lgh %r0, 10(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f14, %r0
+; SCALAR-NEXT: lgh %r0, 8(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f15, %f0
+; SCALAR-NEXT: ler %f0, %f12
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f15
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f12, %f0
+; SCALAR-NEXT: ler %f0, %f14
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f14, %f0
+; SCALAR-NEXT: ler %f0, %f11
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f14
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f11, %f0
+; SCALAR-NEXT: ler %f0, %f13
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f13, %f0
+; SCALAR-NEXT: ler %f0, %f9
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f13
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f9, %f0
+; SCALAR-NEXT: ler %f0, %f10
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ler %f10, %f0
+; SCALAR-NEXT: ler %f0, %f8
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: sebr %f0, %f10
+; SCALAR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; SCALAR-NEXT: ler %f6, %f0
+; SCALAR-NEXT: ler %f0, %f12
+; SCALAR-NEXT: ler %f2, %f11
+; SCALAR-NEXT: ler %f4, %f9
+; SCALAR-NEXT: ld %f8, 216(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f9, 208(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f10, 200(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f11, 192(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f12, 184(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f13, 176(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f14, 168(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f15, 160(%r15) # 8-byte Reload
+; SCALAR-NEXT: lmg %r14, %r15, 336(%r15)
+; SCALAR-NEXT: br %r14
+ %L0 = load <4 x half>, ptr %Src
+ %Ptr1 = getelementptr <4 x half>, ptr %Src, i64 1
+ %L1 = load <4 x half>, ptr %Ptr1
+ %Res = fsub <4 x half> %L0, %L1
+ ret <4 x half> %Res
+}
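For reference, the 32256 splatted by the vrepih in fun3 is 0x7e00: sign 0,
all-ones exponent, mantissa MSB set, i.e. the default quiet NaN for IEEE-754
binary16, which is what the constant fold produces for an fsub with one undef
operand. As an IR constant:

  ; half qNaN; 0xH7E00 == 32256 == 0b0_11111_1000000000.
  @h_qnan = constant half 0xH7E00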
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-conversions.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-conversions.ll
new file mode 100644
index 0000000000000..9f926c0e640b6
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-conversions.ll
@@ -0,0 +1,2 @@
+; TODO:
+; bitconvert, SCALAR_TO_VECTOR, merge-high, merge-low
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-vsel.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-vsel.ll
new file mode 100644
index 0000000000000..b7dbaea2188c4
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-vsel.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=SCALAR
+
+define <4 x i1> @fun0(ptr %Src) {
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r12, %r15, 96(%r15)
+; VECTOR-NEXT: .cfi_offset %r12, -64
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
+; VECTOR-NEXT: vlrepg %v0, 0(%r2)
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ltebr %f0, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lhi %r12, 0
+; VECTOR-NEXT: lhi %r13, 0
+; VECTOR-NEXT: lochie %r12, -1
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ltebr %f0, %f0
+; VECTOR-NEXT: lhi %r0, 0
+; VECTOR-NEXT: lochie %r0, -1
+; VECTOR-NEXT: vlvgp %v0, %r0, %r12
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ltebr %f0, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lhi %r0, 0
+; VECTOR-NEXT: lochie %r0, -1
+; VECTOR-NEXT: vlvgh %v0, %r0, 1
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ltebr %f0, %f0
+; VECTOR-NEXT: vl %v24, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lochie %r13, -1
+; VECTOR-NEXT: vlvgh %v24, %r13, 5
+; VECTOR-NEXT: lmg %r12, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun0:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: stmg %r11, %r15, 88(%r15)
+; SCALAR-NEXT: .cfi_offset %r11, -72
+; SCALAR-NEXT: .cfi_offset %r12, -64
+; SCALAR-NEXT: .cfi_offset %r13, -56
+; SCALAR-NEXT: .cfi_offset %r14, -48
+; SCALAR-NEXT: .cfi_offset %r15, -40
+; SCALAR-NEXT: aghi %r15, -184
+; SCALAR-NEXT: .cfi_def_cfa_offset 344
+; SCALAR-NEXT: std %f8, 176(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f9, 168(%r15) # 8-byte Spill
+; SCALAR-NEXT: std %f10, 160(%r15) # 8-byte Spill
+; SCALAR-NEXT: .cfi_offset %f8, -168
+; SCALAR-NEXT: .cfi_offset %f9, -176
+; SCALAR-NEXT: .cfi_offset %f10, -184
+; SCALAR-NEXT: lgh %r0, 6(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f8, %r0
+; SCALAR-NEXT: lgh %r0, 4(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f9, %r0
+; SCALAR-NEXT: lgh %r0, 2(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f10, %r0
+; SCALAR-NEXT: lgh %r0, 0(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ltebr %f0, %f0
+; SCALAR-NEXT: ler %f0, %f10
+; SCALAR-NEXT: ipm %r13
+; SCALAR-NEXT: afi %r13, -268435456
+; SCALAR-NEXT: srl %r13, 31
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ltebr %f0, %f0
+; SCALAR-NEXT: ler %f0, %f9
+; SCALAR-NEXT: ipm %r12
+; SCALAR-NEXT: afi %r12, -268435456
+; SCALAR-NEXT: srl %r12, 31
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ltebr %f0, %f0
+; SCALAR-NEXT: ler %f0, %f8
+; SCALAR-NEXT: ipm %r11
+; SCALAR-NEXT: afi %r11, -268435456
+; SCALAR-NEXT: srl %r11, 31
+; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; SCALAR-NEXT: ltebr %f0, %f0
+; SCALAR-NEXT: ld %f8, 176(%r15) # 8-byte Reload
+; SCALAR-NEXT: ipm %r5
+; SCALAR-NEXT: ld %f9, 168(%r15) # 8-byte Reload
+; SCALAR-NEXT: ld %f10, 160(%r15) # 8-byte Reload
+; SCALAR-NEXT: afi %r5, -268435456
+; SCALAR-NEXT: srl %r5, 31
+; SCALAR-NEXT: lr %r2, %r13
+; SCALAR-NEXT: lr %r3, %r12
+; SCALAR-NEXT: lr %r4, %r11
+; SCALAR-NEXT: lmg %r11, %r15, 272(%r15)
+; SCALAR-NEXT: br %r14
+ %1 = load <4 x half>, ptr %Src
+ %2 = fcmp oeq <4 x half> %1, zeroinitializer
+ ret <4 x i1> %2
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll
new file mode 100644
index 0000000000000..30bbc7de08dd7
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=SCALAR
+
+define void @fun0(ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: vl %v0, 0(%r2), 3
+; VECTOR-NEXT: vst %v0, 0(%r3), 3
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun0:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: lgh %r0, 0(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: lgh %r0, 2(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f1, %r0
+; SCALAR-NEXT: lgh %r0, 4(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f2, %r0
+; SCALAR-NEXT: lgh %r0, 6(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f3, %r0
+; SCALAR-NEXT: lgh %r0, 8(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f4, %r0
+; SCALAR-NEXT: lgh %r0, 10(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f5, %r0
+; SCALAR-NEXT: lgh %r0, 12(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f6, %r0
+; SCALAR-NEXT: lgh %r0, 14(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f7, %r0
+; SCALAR-NEXT: lgdr %r0, %f7
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 14(%r3)
+; SCALAR-NEXT: lgdr %r0, %f6
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 12(%r3)
+; SCALAR-NEXT: lgdr %r0, %f5
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 10(%r3)
+; SCALAR-NEXT: lgdr %r0, %f4
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 8(%r3)
+; SCALAR-NEXT: lgdr %r0, %f3
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r3)
+; SCALAR-NEXT: lgdr %r0, %f2
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 4(%r3)
+; SCALAR-NEXT: lgdr %r0, %f1
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 2(%r3)
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 0(%r3)
+; SCALAR-NEXT: br %r14
+ %L = load <8 x half>, ptr %Src
+ store <8 x half> %L, ptr %Dst
+ ret void
+}
+
+define void @fun1(ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: lg %r0, 0(%r2)
+; VECTOR-NEXT: stg %r0, 0(%r3)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun1:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: lgh %r0, 4(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f0, %r0
+; SCALAR-NEXT: lgh %r0, 6(%r2)
+; SCALAR-NEXT: sllg %r0, %r0, 48
+; SCALAR-NEXT: ldgr %f1, %r0
+; SCALAR-NEXT: l %r0, 0(%r2)
+; SCALAR-NEXT: st %r0, 0(%r3)
+; SCALAR-NEXT: lgdr %r0, %f1
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 6(%r3)
+; SCALAR-NEXT: lgdr %r0, %f0
+; SCALAR-NEXT: srlg %r0, %r0, 48
+; SCALAR-NEXT: sth %r0, 4(%r3)
+; SCALAR-NEXT: br %r14
+ %L = load <4 x half>, ptr %Src
+ store <4 x half> %L, ptr %Dst
+ ret void
+}
+
+define void @fun2(ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: vl %v0, 0(%r2), 4
+; VECTOR-NEXT: vst %v0, 0(%r3), 4
+; VECTOR-NEXT: lg %r0, 16(%r2)
+; VECTOR-NEXT: stg %r0, 16(%r3)
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun2:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: lg %r0, 16(%r2)
+; SCALAR-NEXT: lg %r1, 8(%r2)
+; SCALAR-NEXT: lg %r2, 0(%r2)
+; SCALAR-NEXT: stg %r2, 0(%r3)
+; SCALAR-NEXT: stg %r1, 8(%r3)
+; SCALAR-NEXT: stg %r0, 16(%r3)
+; SCALAR-NEXT: br %r14
+ %L = load <12 x half>, ptr %Src
+ store <12 x half> %L, ptr %Dst
+ ret void
+}
+
+define void @fun3(ptr %Src, ptr %Dst) {
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: vl %v0, 16(%r2), 4
+; VECTOR-NEXT: vl %v1, 0(%r2), 4
+; VECTOR-NEXT: vst %v1, 0(%r3), 4
+; VECTOR-NEXT: vst %v0, 16(%r3), 4
+; VECTOR-NEXT: br %r14
+;
+; SCALAR-LABEL: fun3:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: lg %r0, 0(%r2)
+; SCALAR-NEXT: lg %r1, 8(%r2)
+; SCALAR-NEXT: lg %r4, 16(%r2)
+; SCALAR-NEXT: lg %r2, 24(%r2)
+; SCALAR-NEXT: stg %r2, 24(%r3)
+; SCALAR-NEXT: stg %r4, 16(%r3)
+; SCALAR-NEXT: stg %r1, 8(%r3)
+; SCALAR-NEXT: stg %r0, 0(%r3)
+; SCALAR-NEXT: br %r14
+ %L = load <16 x half>, ptr %Src
+ store <16 x half> %L, ptr %Dst
+ ret void
+}
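A note on the trailing operand of vl/vst in the VECTOR checks above: it is
the alignment hint, the log2 of the access's known alignment (3 = 8-byte,
4 = 16-byte). A hedged IR sketch with explicit alignments chosen to produce
those hints (the tests themselves rely on the types' ABI alignments):

  %L = load <8 x half>, ptr %Src, align 8    ; emitted as: vl %v0, 0(%r2), 3
  store <8 x half> %L, ptr %Dst, align 16    ; emitted as: vst %v0, 0(%r3), 4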
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
index 4997c5b0c617d..824d917444e07 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
@@ -124,91 +124,88 @@ define <8 x half> @fun0(<8 x half> %Op) {
;
; VECTOR-LABEL: fun0:
; VECTOR: # %bb.0: # %entry
-; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
-; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -224
; VECTOR-NEXT: .cfi_def_cfa_offset 384
-; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Spill
-; VECTOR-NEXT: .cfi_offset %f8, -168
-; VECTOR-NEXT: .cfi_offset %f9, -176
-; VECTOR-NEXT: .cfi_offset %f10, -184
-; VECTOR-NEXT: .cfi_offset %f11, -192
-; VECTOR-NEXT: .cfi_offset %f12, -200
-; VECTOR-NEXT: .cfi_offset %f13, -208
-; VECTOR-NEXT: .cfi_offset %f14, -216
-; VECTOR-NEXT: .cfi_offset %f15, -224
-; VECTOR-NEXT: vlreph %v11, 414(%r15)
-; VECTOR-NEXT: vlreph %v12, 406(%r15)
-; VECTOR-NEXT: vlreph %v13, 398(%r15)
-; VECTOR-NEXT: vlreph %v14, 390(%r15)
-; VECTOR-NEXT: ldr %f8, %f6
-; VECTOR-NEXT: ldr %f9, %f4
-; VECTOR-NEXT: ldr %f10, %f2
-; VECTOR-NEXT: lgr %r13, %r2
+; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v24, 7
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f15, %f0
-; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 6
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f10, %f0
-; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 5
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f9, %f0
-; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 4
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: ldr %f0, %f14
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f14, %f0
-; VECTOR-NEXT: ldr %f0, %f13
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f13, %f0
-; VECTOR-NEXT: ldr %f0, %f12
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f12, %f0
-; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vsteh %v0, 14(%r13), 0
-; VECTOR-NEXT: vsteh %v12, 12(%r13), 0
-; VECTOR-NEXT: vsteh %v13, 10(%r13), 0
-; VECTOR-NEXT: vsteh %v14, 8(%r13), 0
-; VECTOR-NEXT: vsteh %v8, 6(%r13), 0
-; VECTOR-NEXT: vsteh %v9, 4(%r13), 0
-; VECTOR-NEXT: vsteh %v10, 2(%r13), 0
-; VECTOR-NEXT: vsteh %v15, 0(%r13), 0
-; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Reload
-; VECTOR-NEXT: lmg %r13, %r15, 328(%r15)
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v1, %v0
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: lmg %r14, %r15, 336(%r15)
; VECTOR-NEXT: br %r14
entry:
%Res = fadd <8 x half> %Op, %Op
@@ -269,46 +266,85 @@ define <4 x half> @fun1(<4 x half> %Op) {
; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -192
-; VECTOR-NEXT: .cfi_def_cfa_offset 352
-; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Spill
-; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Spill
-; VECTOR-NEXT: .cfi_offset %f8, -168
-; VECTOR-NEXT: .cfi_offset %f9, -176
-; VECTOR-NEXT: .cfi_offset %f10, -184
-; VECTOR-NEXT: .cfi_offset %f11, -192
-; VECTOR-NEXT: ldr %f8, %f6
-; VECTOR-NEXT: ldr %f9, %f4
-; VECTOR-NEXT: ldr %f10, %f2
+; VECTOR-NEXT: aghi %r15, -224
+; VECTOR-NEXT: .cfi_def_cfa_offset 384
+; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v24, 7
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f11, %f0
-; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 6
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f10, %f0
-; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 5
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f9, %f0
-; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 4
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
; VECTOR-NEXT: aebr %f0, %f0
; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ldr %f6, %f0
-; VECTOR-NEXT: ldr %f0, %f11
-; VECTOR-NEXT: ldr %f2, %f10
-; VECTOR-NEXT: ldr %f4, %f9
-; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Reload
-; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Reload
-; VECTOR-NEXT: lmg %r14, %r15, 304(%r15)
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v1, %v0
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhf %v0, %v0, %v1
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: lmg %r14, %r15, 336(%r15)
; VECTOR-NEXT: br %r14
entry:
%Res = fadd <4 x half> %Op, %Op
@@ -353,33 +389,38 @@ define <2 x half> @fun2(<2 x half> %Op) {
; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -184
-; VECTOR-NEXT: .cfi_def_cfa_offset 344
-; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Spill
-; VECTOR-NEXT: .cfi_offset %f8, -168
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
+; VECTOR-NEXT: vlr %v0, %v24
+; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
-; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
-; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
-; VECTOR-NEXT: vmrhg %v0, %v0, %v1
+; VECTOR-NEXT: vmrhg %v0, %v1, %v0
; VECTOR-NEXT: vfadb %v0, %v0, %v0
; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
-; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
; VECTOR-NEXT: vrepg %v0, %v0, 1
; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
-; VECTOR-NEXT: ldr %f2, %f0
-; VECTOR-NEXT: ldr %f0, %f8
-; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload
-; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v1, %v0
+; VECTOR-NEXT: vmrhf %v0, %v0, %v0
+; VECTOR-NEXT: vmrhf %v1, %v0, %v0
+; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: lmg %r14, %r15, 304(%r15)
; VECTOR-NEXT: br %r14
entry:
%E = fpext <2 x half> %Op to <2 x double>
@@ -444,22 +485,8 @@ define void @fun3(ptr %Src, ptr %Dst) {
;
; VECTOR-LABEL: fun3:
; VECTOR: # %bb.0: # %entry
-; VECTOR-NEXT: vlreph %v0, 0(%r2)
-; VECTOR-NEXT: vlreph %v1, 2(%r2)
-; VECTOR-NEXT: vlreph %v2, 4(%r2)
-; VECTOR-NEXT: vlreph %v3, 6(%r2)
-; VECTOR-NEXT: vlreph %v4, 8(%r2)
-; VECTOR-NEXT: vlreph %v5, 10(%r2)
-; VECTOR-NEXT: vlreph %v6, 12(%r2)
-; VECTOR-NEXT: vlreph %v7, 14(%r2)
-; VECTOR-NEXT: vsteh %v7, 14(%r3), 0
-; VECTOR-NEXT: vsteh %v6, 12(%r3), 0
-; VECTOR-NEXT: vsteh %v5, 10(%r3), 0
-; VECTOR-NEXT: vsteh %v4, 8(%r3), 0
-; VECTOR-NEXT: vsteh %v3, 6(%r3), 0
-; VECTOR-NEXT: vsteh %v2, 4(%r3), 0
-; VECTOR-NEXT: vsteh %v1, 2(%r3), 0
-; VECTOR-NEXT: vsteh %v0, 0(%r3), 0
+; VECTOR-NEXT: vl %v0, 0(%r2), 3
+; VECTOR-NEXT: vst %v0, 0(%r3), 3
; VECTOR-NEXT: br %r14
entry:
%L = load <8 x half>, ptr %Src
@@ -578,40 +605,13 @@ define void @fun4(ptr %Src, ptr %Dst) {
; VECTOR-NEXT: .cfi_offset %r13, -56
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -208
-; VECTOR-NEXT: .cfi_def_cfa_offset 368
-; VECTOR-NEXT: vlreph %v6, 6(%r2)
-; VECTOR-NEXT: vlreph %v4, 4(%r2)
-; VECTOR-NEXT: vlreph %v2, 2(%r2)
-; VECTOR-NEXT: vlreph %v0, 0(%r2)
-; VECTOR-NEXT: vlreph %v1, 8(%r2)
-; VECTOR-NEXT: vlreph %v3, 10(%r2)
-; VECTOR-NEXT: vlreph %v5, 12(%r2)
-; VECTOR-NEXT: vlreph %v7, 14(%r2)
-; VECTOR-NEXT: la %r2, 192(%r15)
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vl %v24, 0(%r2), 3
; VECTOR-NEXT: lgr %r13, %r3
-; VECTOR-NEXT: vsteh %v7, 190(%r15), 0
-; VECTOR-NEXT: vsteh %v5, 182(%r15), 0
-; VECTOR-NEXT: vsteh %v3, 174(%r15), 0
-; VECTOR-NEXT: vsteh %v1, 166(%r15), 0
; VECTOR-NEXT: brasl %r14, foo at PLT
-; VECTOR-NEXT: vlreph %v0, 192(%r15)
-; VECTOR-NEXT: vlreph %v1, 194(%r15)
-; VECTOR-NEXT: vlreph %v2, 196(%r15)
-; VECTOR-NEXT: vlreph %v3, 198(%r15)
-; VECTOR-NEXT: vlreph %v4, 200(%r15)
-; VECTOR-NEXT: vlreph %v5, 202(%r15)
-; VECTOR-NEXT: vlreph %v6, 204(%r15)
-; VECTOR-NEXT: vlreph %v7, 206(%r15)
-; VECTOR-NEXT: vsteh %v7, 14(%r13), 0
-; VECTOR-NEXT: vsteh %v6, 12(%r13), 0
-; VECTOR-NEXT: vsteh %v5, 10(%r13), 0
-; VECTOR-NEXT: vsteh %v4, 8(%r13), 0
-; VECTOR-NEXT: vsteh %v3, 6(%r13), 0
-; VECTOR-NEXT: vsteh %v2, 4(%r13), 0
-; VECTOR-NEXT: vsteh %v1, 2(%r13), 0
-; VECTOR-NEXT: vsteh %v0, 0(%r13), 0
-; VECTOR-NEXT: lmg %r13, %r15, 312(%r15)
+; VECTOR-NEXT: vst %v24, 0(%r13), 3
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
; VECTOR-NEXT: br %r14
entry:
%arg = load <8 x half>, ptr %Src
@@ -699,26 +699,10 @@ define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) {
; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -224
-; VECTOR-NEXT: .cfi_def_cfa_offset 384
-; VECTOR-NEXT: vlreph %v1, 390(%r15)
-; VECTOR-NEXT: vlreph %v3, 398(%r15)
-; VECTOR-NEXT: vlreph %v5, 406(%r15)
-; VECTOR-NEXT: vlreph %v7, 414(%r15)
-; VECTOR-NEXT: vlreph %v16, 422(%r15)
-; VECTOR-NEXT: vlreph %v17, 430(%r15)
-; VECTOR-NEXT: vlreph %v18, 438(%r15)
-; VECTOR-NEXT: vlreph %v19, 446(%r15)
-; VECTOR-NEXT: vsteh %v19, 222(%r15), 0
-; VECTOR-NEXT: vsteh %v18, 214(%r15), 0
-; VECTOR-NEXT: vsteh %v17, 206(%r15), 0
-; VECTOR-NEXT: vsteh %v16, 198(%r15), 0
-; VECTOR-NEXT: vsteh %v7, 190(%r15), 0
-; VECTOR-NEXT: vsteh %v5, 182(%r15), 0
-; VECTOR-NEXT: vsteh %v3, 174(%r15), 0
-; VECTOR-NEXT: vsteh %v1, 166(%r15), 0
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
; VECTOR-NEXT: brasl %r14, foo2@PLT
-; VECTOR-NEXT: lmg %r14, %r15, 336(%r15)
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
call void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
ret void
>From 0f8bda08849d1d8eb2bb323062f6c182b436de4e Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 5 Dec 2025 00:27:37 +0100
Subject: [PATCH 2/4] Expand fp16 vectors, but handle ABI.
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 -
.../Target/SystemZ/SystemZISelLowering.cpp | 34 +-
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 24 +-
.../CodeGen/SystemZ/fp-half-vector-args.ll | 1228 +++++++++--------
.../CodeGen/SystemZ/fp-half-vector-binops.ll | 1115 +++++----------
.../CodeGen/SystemZ/fp-half-vector-conv.ll | 178 +++
.../SystemZ/fp-half-vector-conversions.ll | 2 -
.../SystemZ/fp-half-vector-fcmp-select.ll | 503 +++++++
.../SystemZ/fp-half-vector-fcmp-vsel.ll | 118 --
.../CodeGen/SystemZ/fp-half-vector-mem.ll | 200 +--
llvm/test/CodeGen/SystemZ/fp-half-vector.ll | 709 ----------
11 files changed, 1841 insertions(+), 2272 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-conversions.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-vsel.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9a8743cf44b85..b009e6a3d5f5f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7432,8 +7432,6 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL,
case ISD::FREM:
// If both operands are undef, the result is undef. If 1 operand is undef,
// the result is NaN. This should match the behavior of the IR optimizer.
- // XXX What if the other operand will become undef later: NaN + undef
- // => undef?
if (N1.isUndef() && N2.isUndef())
return getUNDEF(VT);
if (N1.isUndef() || N2.isUndef())
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index de2e018680f5b..aaa6c22eaf01a 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -845,6 +845,33 @@ bool SystemZTargetLowering::useSoftFloat() const {
return Subtarget.hasSoftFloat();
}
+unsigned SystemZTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT) const {
+ // i128 inline assembly operand.
+ if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped)
+ return 1;
+ // Pass narrow fp16 vectors per the ABI even though they are generally
+ // expanded.
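+ // A vector register holds SystemZ::VectorBytes / 2 == 8 half elements, so
+ // e.g. <8 x half> takes one register and <16 x half> takes two.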
+ if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16)
+ return divideCeil(VT.getVectorNumElements(), SystemZ::VectorBytes / 2);
+ return TargetLowering::getNumRegisters(Context, VT);
+}
+
+MVT SystemZTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ // 128-bit single-element vector types are passed like other vectors,
+ // not like their element type.
+ if (VT.isVector() && VT.getSizeInBits() == 128 &&
+ VT.getVectorNumElements() == 1)
+ return MVT::v16i8;
+ // Pass narrow fp16 vectors per the ABI even though they are generally
+ // expanded.
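+ // E.g. a <4 x half> argument is widened to v8f16 and passed in the
+ // leftmost elements of a single vector register.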
+ if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16)
+ return MVT::v8f16;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &, EVT VT) const {
if (!VT.isVector())
@@ -7585,13 +7612,6 @@ SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
Op = Op.getOperand(0);
Index = Byte / BytesPerElement;
Force = true;
- } else if (Opcode == ISD::SCALAR_TO_VECTOR && ResVT == MVT::f16) {
- // The vector was first widened and then expanded. Expose undef
- // elements to eliminate the unneeded operations.
- EVT OpVT = Op.getValueType();
- if (Index * ResVT.getScalarSizeInBits() >= OpVT.getScalarSizeInBits())
- return DAG.getUNDEF(ResVT);
- break;
} else
break;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 13a1cd1614a53..ca47b96ef2d80 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -64,27 +64,19 @@ class SystemZTargetLowering : public TargetLowering {
//
// (c) there are no multiplication instructions for the widest integer
// type (v2i64).
+
+ // Expand (narrow) f16 vectors during type legalization to avoid emitting
+ // operations for all (widened) elements, as would happen with expansion
+ // after widening.
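+ // Splitting halves the vector repeatedly; a v1f16 is then scalarized.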
+ if (VT.getScalarType() == MVT::f16)
+ return VT.getVectorElementCount().isScalar() ? TypeScalarizeVector
+                                              : TypeSplitVector;
if (VT.getScalarSizeInBits() % 8 == 0)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
- unsigned
- getNumRegisters(LLVMContext &Context, EVT VT,
- std::optional<MVT> RegisterVT) const override {
- // i128 inline assembly operand.
- if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped)
- return 1;
- return TargetLowering::getNumRegisters(Context, VT);
- }
+ unsigned getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT) const override;
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const override {
- // 128-bit single-element vector types are passed like other vectors,
- // not like their element type.
- if (VT.isVector() && VT.getSizeInBits() == 128 &&
- VT.getVectorNumElements() == 1)
- return MVT::v16i8;
- return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
- }
+ EVT VT) const override;
bool isCheapToSpeculateCtlz(Type *) const override { return true; }
bool isCheapToSpeculateCttz(Type *) const override { return true; }
bool preferZeroCompareBranch() const override { return true; }
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
index aee9161bd29ae..381bfad51188f 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
@@ -1,639 +1,711 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=VECTOR
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=SCALAR
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR
+;
+; Test passing and returning fp16 vector values.
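+; Without vector support the first four elements are passed in %f0, %f2,
+; %f4 and %f6 and any remaining elements on the stack; with vector support
+; a full <8 x half> is passed in %v24.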
+
+@Fnptr = external global ptr
+@Src = external global ptr
+@Dst = external global ptr
-; Function argument in vector register.
-declare void @foo0(<8 x half>)
-define void @fun0(<8 x half> %A, ptr %Src, ptr %Dst) {
-; VECTOR-LABEL: fun0:
+%Ty0 = type <8 x half>
+define void @fun0_arg(%Ty0 %A) {
+; CHECK-LABEL: fun0_arg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgh %r0, 166(%r15)
+; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d
+; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d
+; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgh %r1, 174(%r15)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f1, %r0
+; CHECK-NEXT: lgh %r0, 182(%r15)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: lgh %r2, 190(%r15)
+; CHECK-NEXT: ldgr %f3, %r1
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f5, %r0
+; CHECK-NEXT: sllg %r0, %r2, 48
+; CHECK-NEXT: lgrl %r1, Dst@GOT
+; CHECK-NEXT: ldgr %f7, %r0
+; CHECK-NEXT: lgdr %r0, %f6
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r1)
+; CHECK-NEXT: lgdr %r0, %f4
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r1)
+; CHECK-NEXT: lgdr %r0, %f2
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r1)
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r1)
+; CHECK-NEXT: lgdr %r0, %f7
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 14(%r1)
+; CHECK-NEXT: lgdr %r0, %f5
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 12(%r1)
+; CHECK-NEXT: lgdr %r0, %f3
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 10(%r1)
+; CHECK-NEXT: lgdr %r0, %f1
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 8(%r1)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0_arg:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: lgrl %r1, Dst@GOT
+; VECTOR-NEXT: vst %v24, 0(%r1), 3
+; VECTOR-NEXT: br %r14
+ store %Ty0 %A, ptr @Dst
+ ret void
+}
+
+define void @fun0_call() {
+; CHECK-LABEL: fun0_call:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -192
+; CHECK-NEXT: .cfi_def_cfa_offset 352
+; CHECK-NEXT: lgrl %r1, Src@GOT
+; CHECK-NEXT: lgh %r0, 0(%r1)
+; CHECK-NEXT: lgh %r2, 2(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: lgh %r0, 4(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: ldgr %f2, %r2
+; CHECK-NEXT: lgh %r2, 6(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f4, %r0
+; CHECK-NEXT: lgh %r0, 8(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: ldgr %f6, %r2
+; CHECK-NEXT: lgh %r2, 10(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f1, %r0
+; CHECK-NEXT: lgh %r0, 12(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: lgh %r1, 14(%r1)
+; CHECK-NEXT: ldgr %f3, %r2
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f5, %r0
+; CHECK-NEXT: sllg %r0, %r1, 48
+; CHECK-NEXT: ldgr %f7, %r0
+; CHECK-NEXT: lgdr %r0, %f7
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 190(%r15)
+; CHECK-NEXT: lgdr %r0, %f5
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 182(%r15)
+; CHECK-NEXT: lgdr %r0, %f3
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 174(%r15)
+; CHECK-NEXT: lgdr %r0, %f1
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 166(%r15)
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; CHECK-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; CHECK-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; CHECK-NEXT: brasl %r14, Fnptr@PLT
+; CHECK-NEXT: lmg %r14, %r15, 304(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0_call:
; VECTOR: # %bb.0:
; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -160
; VECTOR-NEXT: .cfi_def_cfa_offset 320
-; VECTOR-NEXT: vst %v24, 0(%r3), 3
-; VECTOR-NEXT: vl %v24, 0(%r2), 3
-; VECTOR-NEXT: brasl %r14, foo0@PLT
+; VECTOR-NEXT: lgrl %r1, Src@GOT
+; VECTOR-NEXT: vl %v24, 0(%r1), 3
+; VECTOR-NEXT: brasl %r14, Fnptr@PLT
; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun0:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -192
-; SCALAR-NEXT: .cfi_def_cfa_offset 352
-; SCALAR-NEXT: lgh %r0, 382(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f1, %r0
-; SCALAR-NEXT: lgh %r0, 374(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f3, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
-; SCALAR-NEXT: # kill: def $f2h killed $f2h def $f2d
-; SCALAR-NEXT: # kill: def $f4h killed $f4h def $f4d
-; SCALAR-NEXT: # kill: def $f6h killed $f6h def $f6d
-; SCALAR-NEXT: lgh %r0, 366(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f5, %r0
-; SCALAR-NEXT: lgh %r0, 358(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f7, %r0
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 0(%r3)
-; SCALAR-NEXT: lgdr %r0, %f7
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 8(%r3)
-; SCALAR-NEXT: lgdr %r0, %f2
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 2(%r3)
-; SCALAR-NEXT: lgdr %r0, %f5
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 10(%r3)
-; SCALAR-NEXT: lgdr %r0, %f4
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 4(%r3)
-; SCALAR-NEXT: lgdr %r0, %f3
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 12(%r3)
-; SCALAR-NEXT: lgdr %r0, %f6
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r3)
-; SCALAR-NEXT: lgdr %r0, %f1
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 14(%r3)
-; SCALAR-NEXT: lgh %r0, 0(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: lgh %r0, 2(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f2, %r0
-; SCALAR-NEXT: # kill: def $f2h killed $f2h killed $f2d
-; SCALAR-NEXT: lgh %r0, 4(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f4, %r0
-; SCALAR-NEXT: # kill: def $f4h killed $f4h killed $f4d
-; SCALAR-NEXT: lgh %r0, 6(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f6, %r0
-; SCALAR-NEXT: # kill: def $f6h killed $f6h killed $f6d
-; SCALAR-NEXT: lgh %r0, 8(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f1, %r0
-; SCALAR-NEXT: lgh %r0, 10(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f3, %r0
-; SCALAR-NEXT: lgh %r0, 12(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f5, %r0
-; SCALAR-NEXT: lgh %r0, 14(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f7, %r0
-; SCALAR-NEXT: lgdr %r0, %f7
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 190(%r15)
-; SCALAR-NEXT: lgdr %r0, %f5
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 182(%r15)
-; SCALAR-NEXT: lgdr %r0, %f3
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 174(%r15)
-; SCALAR-NEXT: lgdr %r0, %f1
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 166(%r15)
-; SCALAR-NEXT: brasl %r14, foo0@PLT
-; SCALAR-NEXT: lmg %r14, %r15, 304(%r15)
-; SCALAR-NEXT: br %r14
- store <8 x half> %A, ptr %Dst
- %L = load <8 x half>, ptr %Src
- call void @foo0(<8 x half> %L)
+ %L = load %Ty0, ptr @Src
+ call void @Fnptr(%Ty0 %L)
ret void
}
-declare void @foo1(<4 x half>)
-define void @fun1(<4 x half> %A, ptr %Src, ptr %Dst) {
-; VECTOR-LABEL: fun1:
+define %Ty0 @fun0_ret() {
+; CHECK-LABEL: fun0_ret:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgrl %r1, Src@GOT
+; CHECK-NEXT: lg %r0, 8(%r1)
+; CHECK-NEXT: lg %r1, 0(%r1)
+; CHECK-NEXT: stg %r0, 8(%r2)
+; CHECK-NEXT: stg %r1, 0(%r2)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0_ret:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: lgrl %r1, Src@GOT
+; VECTOR-NEXT: vl %v24, 0(%r1), 3
+; VECTOR-NEXT: br %r14
+ %L = load %Ty0, ptr @Src
+ ret %Ty0 %L
+}
+
+define void @fun0_store_returned() {
+; CHECK-LABEL: fun0_store_returned:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: la %r2, 160(%r15)
+; CHECK-NEXT: brasl %r14, Fnptr@PLT
+; CHECK-NEXT: lg %r0, 168(%r15)
+; CHECK-NEXT: lgrl %r1, Dst@GOT
+; CHECK-NEXT: lg %r2, 160(%r15)
+; CHECK-NEXT: stg %r0, 8(%r1)
+; CHECK-NEXT: stg %r2, 0(%r1)
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0_store_returned:
; VECTOR: # %bb.0:
; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -160
; VECTOR-NEXT: .cfi_def_cfa_offset 320
-; VECTOR-NEXT: vsteg %v24, 0(%r3), 0
-; VECTOR-NEXT: vlrepg %v24, 0(%r2)
-; VECTOR-NEXT: brasl %r14, foo0@PLT
+; VECTOR-NEXT: brasl %r14, Fnptr@PLT
+; VECTOR-NEXT: lgrl %r1, Dst@GOT
+; VECTOR-NEXT: vst %v24, 0(%r1), 3
; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
+ %C = call %Ty0 @Fnptr()
+ store %Ty0 %C, ptr @Dst
+ ret void
+}
+
+%Ty1 = type <4 x half>
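+; A <4 x half> is passed in the leftmost four elements of %v24 with vector
+; support, or in %f0, %f2, %f4 and %f6 otherwise.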
+define void @fun1_arg(%Ty1 %A) {
+; CHECK-LABEL: fun1_arg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgrl %r1, Dst@GOT
+; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d
+; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d
+; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f6
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r1)
+; CHECK-NEXT: lgdr %r0, %f4
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r1)
+; CHECK-NEXT: lgdr %r0, %f2
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r1)
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r1)
+; CHECK-NEXT: br %r14
;
-; SCALAR-LABEL: fun1:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -160
-; SCALAR-NEXT: .cfi_def_cfa_offset 320
-; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: # kill: def $f2h killed $f2h def $f2d
-; SCALAR-NEXT: sth %r0, 0(%r3)
-; SCALAR-NEXT: lgdr %r0, %f2
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: # kill: def $f4h killed $f4h def $f4d
-; SCALAR-NEXT: sth %r0, 2(%r3)
-; SCALAR-NEXT: # kill: def $f6h killed $f6h def $f6d
-; SCALAR-NEXT: lgdr %r0, %f4
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 4(%r3)
-; SCALAR-NEXT: lgdr %r0, %f6
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r3)
-; SCALAR-NEXT: lgh %r0, 0(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: lgh %r0, 2(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f2, %r0
-; SCALAR-NEXT: # kill: def $f2h killed $f2h killed $f2d
-; SCALAR-NEXT: lgh %r0, 4(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f4, %r0
-; SCALAR-NEXT: # kill: def $f4h killed $f4h killed $f4d
-; SCALAR-NEXT: lgh %r0, 6(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f6, %r0
-; SCALAR-NEXT: # kill: def $f6h killed $f6h killed $f6d
-; SCALAR-NEXT: brasl %r14, foo0@PLT
-; SCALAR-NEXT: lmg %r14, %r15, 272(%r15)
-; SCALAR-NEXT: br %r14
- store <4 x half> %A, ptr %Dst
- %L = load <4 x half>, ptr %Src
- call void @foo0(<4 x half> %L)
+; VECTOR-LABEL: fun1_arg:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: lgrl %r1, Dst@GOT
+; VECTOR-NEXT: vreph %v0, %v24, 1
+; VECTOR-NEXT: vreph %v1, %v24, 2
+; VECTOR-NEXT: vreph %v2, %v24, 3
+; VECTOR-NEXT: vsteh %v24, 0(%r1), 0
+; VECTOR-NEXT: vsteh %v2, 6(%r1), 0
+; VECTOR-NEXT: vsteh %v1, 4(%r1), 0
+; VECTOR-NEXT: vsteh %v0, 2(%r1), 0
+; VECTOR-NEXT: br %r14
+ store %Ty1 %A, ptr @Dst
ret void
}
-declare void @foo2(<16 x half>)
-define void @fun2(<16 x half> %A, ptr %Src, ptr %Dst) {
-; VECTOR-LABEL: fun2:
+define void @fun1_call() {
+; CHECK-LABEL: fun1_call:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lgrl %r1, Src@GOT
+; CHECK-NEXT: lgh %r0, 0(%r1)
+; CHECK-NEXT: lgh %r2, 2(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: lgh %r0, 4(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: lgh %r1, 6(%r1)
+; CHECK-NEXT: ldgr %f2, %r2
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f4, %r0
+; CHECK-NEXT: sllg %r0, %r1, 48
+; CHECK-NEXT: ldgr %f6, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; CHECK-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; CHECK-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; CHECK-NEXT: brasl %r14, Fnptr@PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1_call:
; VECTOR: # %bb.0:
; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -160
; VECTOR-NEXT: .cfi_def_cfa_offset 320
-; VECTOR-NEXT: vst %v24, 0(%r3), 4
-; VECTOR-NEXT: vst %v26, 16(%r3), 4
-; VECTOR-NEXT: vl %v24, 0(%r2), 4
-; VECTOR-NEXT: vl %v26, 16(%r2), 4
-; VECTOR-NEXT: brasl %r14, foo0@PLT
+; VECTOR-NEXT: lgrl %r1, Src@GOT
+; VECTOR-NEXT: vlreph %v0, 0(%r1)
+; VECTOR-NEXT: vlreph %v1, 2(%r1)
+; VECTOR-NEXT: vlreph %v2, 4(%r1)
+; VECTOR-NEXT: vlreph %v3, 6(%r1)
+; VECTOR-NEXT: vmrhh %v2, %v2, %v3
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vmrhf %v0, %v0, %v2
+; VECTOR-NEXT: vmrhf %v1, %v0, %v0
+; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: brasl %r14, Fnptr@PLT
; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun2:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -320
-; SCALAR-NEXT: .cfi_def_cfa_offset 480
-; SCALAR-NEXT: std %f8, 312(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f9, 304(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f10, 296(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f11, 288(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f12, 280(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f13, 272(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f14, 264(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f15, 256(%r15) # 8-byte Spill
-; SCALAR-NEXT: .cfi_offset %f8, -168
-; SCALAR-NEXT: .cfi_offset %f9, -176
-; SCALAR-NEXT: .cfi_offset %f10, -184
-; SCALAR-NEXT: .cfi_offset %f11, -192
-; SCALAR-NEXT: .cfi_offset %f12, -200
-; SCALAR-NEXT: .cfi_offset %f13, -208
-; SCALAR-NEXT: .cfi_offset %f14, -216
-; SCALAR-NEXT: .cfi_offset %f15, -224
-; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
-; SCALAR-NEXT: # kill: def $f2h killed $f2h def $f2d
-; SCALAR-NEXT: # kill: def $f4h killed $f4h def $f4d
-; SCALAR-NEXT: # kill: def $f6h killed $f6h def $f6d
-; SCALAR-NEXT: lgh %r0, 574(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f1, %r0
-; SCALAR-NEXT: lgh %r0, 566(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f3, %r0
-; SCALAR-NEXT: lgh %r0, 558(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f5, %r0
-; SCALAR-NEXT: lgh %r0, 550(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f7, %r0
-; SCALAR-NEXT: lgh %r0, 542(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f8, %r0
-; SCALAR-NEXT: lgh %r0, 534(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f9, %r0
-; SCALAR-NEXT: lgh %r0, 526(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f10, %r0
-; SCALAR-NEXT: lgh %r0, 518(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f11, %r0
-; SCALAR-NEXT: lgh %r0, 510(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f12, %r0
-; SCALAR-NEXT: lgh %r0, 502(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f13, %r0
-; SCALAR-NEXT: lgh %r0, 494(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f14, %r0
-; SCALAR-NEXT: lgh %r0, 486(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f15, %r0
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 0(%r3)
-; SCALAR-NEXT: lgdr %r0, %f2
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 2(%r3)
-; SCALAR-NEXT: lgdr %r0, %f4
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 4(%r3)
-; SCALAR-NEXT: lgdr %r0, %f6
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r3)
-; SCALAR-NEXT: lgdr %r0, %f15
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 8(%r3)
-; SCALAR-NEXT: lgdr %r0, %f14
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 10(%r3)
-; SCALAR-NEXT: lgdr %r0, %f13
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 12(%r3)
-; SCALAR-NEXT: lgdr %r0, %f12
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 14(%r3)
-; SCALAR-NEXT: lgdr %r0, %f11
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 16(%r3)
-; SCALAR-NEXT: lgdr %r0, %f10
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 18(%r3)
-; SCALAR-NEXT: lgdr %r0, %f9
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 20(%r3)
-; SCALAR-NEXT: lgdr %r0, %f8
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 22(%r3)
-; SCALAR-NEXT: lgdr %r0, %f7
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 24(%r3)
-; SCALAR-NEXT: lgdr %r0, %f5
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 26(%r3)
-; SCALAR-NEXT: lgdr %r0, %f3
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 28(%r3)
-; SCALAR-NEXT: lgdr %r0, %f1
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 30(%r3)
-; SCALAR-NEXT: lgh %r0, 0(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: lgh %r0, 2(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f2, %r0
-; SCALAR-NEXT: # kill: def $f2h killed $f2h killed $f2d
-; SCALAR-NEXT: lgh %r0, 4(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f4, %r0
-; SCALAR-NEXT: # kill: def $f4h killed $f4h killed $f4d
-; SCALAR-NEXT: lgh %r0, 6(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f6, %r0
-; SCALAR-NEXT: # kill: def $f6h killed $f6h killed $f6d
-; SCALAR-NEXT: lgh %r0, 8(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f1, %r0
-; SCALAR-NEXT: lgh %r0, 10(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f3, %r0
-; SCALAR-NEXT: lgh %r0, 12(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f5, %r0
-; SCALAR-NEXT: lgh %r0, 14(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f7, %r0
-; SCALAR-NEXT: lgh %r0, 16(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f8, %r0
-; SCALAR-NEXT: lgh %r0, 18(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f9, %r0
-; SCALAR-NEXT: lgh %r0, 20(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f10, %r0
-; SCALAR-NEXT: lgh %r0, 22(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f11, %r0
-; SCALAR-NEXT: lgh %r0, 24(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f12, %r0
-; SCALAR-NEXT: lgh %r0, 26(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f13, %r0
-; SCALAR-NEXT: lgh %r0, 28(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f14, %r0
-; SCALAR-NEXT: lgh %r0, 30(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f15, %r0
-; SCALAR-NEXT: lgdr %r0, %f15
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 254(%r15)
-; SCALAR-NEXT: lgdr %r0, %f14
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 246(%r15)
-; SCALAR-NEXT: lgdr %r0, %f13
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 238(%r15)
-; SCALAR-NEXT: lgdr %r0, %f12
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 230(%r15)
-; SCALAR-NEXT: lgdr %r0, %f11
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 222(%r15)
-; SCALAR-NEXT: lgdr %r0, %f10
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 214(%r15)
-; SCALAR-NEXT: lgdr %r0, %f9
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 206(%r15)
-; SCALAR-NEXT: lgdr %r0, %f8
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 198(%r15)
-; SCALAR-NEXT: lgdr %r0, %f7
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 190(%r15)
-; SCALAR-NEXT: lgdr %r0, %f5
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 182(%r15)
-; SCALAR-NEXT: lgdr %r0, %f3
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 174(%r15)
-; SCALAR-NEXT: lgdr %r0, %f1
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 166(%r15)
-; SCALAR-NEXT: brasl %r14, foo0@PLT
-; SCALAR-NEXT: ld %f8, 312(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f9, 304(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f10, 296(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f11, 288(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f12, 280(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f13, 272(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f14, 264(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f15, 256(%r15) # 8-byte Reload
-; SCALAR-NEXT: lmg %r14, %r15, 432(%r15)
-; SCALAR-NEXT: br %r14
- store <16 x half> %A, ptr %Dst
- %L = load <16 x half>, ptr %Src
- call void @foo0(<16 x half> %L)
+ %L = load %Ty1, ptr @Src
+ call void @Fnptr(%Ty1 %L)
ret void
}
-; Return in vector register.
-declare <8 x half> @foo3()
-define <8 x half> @fun3(ptr %Src, ptr %Dst) {
-; VECTOR-LABEL: fun3:
+define %Ty1 @fun1_ret() {
+; CHECK-LABEL: fun1_ret:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgrl %r1, Src@GOT
+; CHECK-NEXT: lgh %r0, 0(%r1)
+; CHECK-NEXT: lgh %r2, 2(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: lgh %r0, 4(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: lgh %r1, 6(%r1)
+; CHECK-NEXT: ldgr %f2, %r2
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f4, %r0
+; CHECK-NEXT: sllg %r0, %r1, 48
+; CHECK-NEXT: ldgr %f6, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; CHECK-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; CHECK-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1_ret:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: lgrl %r1, Src@GOT
+; VECTOR-NEXT: vlreph %v0, 0(%r1)
+; VECTOR-NEXT: vlreph %v1, 2(%r1)
+; VECTOR-NEXT: vlreph %v2, 4(%r1)
+; VECTOR-NEXT: vlreph %v3, 6(%r1)
+; VECTOR-NEXT: vmrhh %v2, %v2, %v3
+; VECTOR-NEXT: vmrhh %v0, %v0, %v1
+; VECTOR-NEXT: vmrhf %v0, %v0, %v2
+; VECTOR-NEXT: vmrhf %v1, %v0, %v0
+; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: br %r14
+ %L = load %Ty1, ptr @Src
+ ret %Ty1 %L
+}
+
+define void @fun1_store_returned() {
+; CHECK-LABEL: fun1_store_returned:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, Fnptr@PLT
+; CHECK-NEXT: lgrl %r1, Dst@GOT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d
+; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d
+; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d
+; CHECK-NEXT: lgdr %r0, %f6
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r1)
+; CHECK-NEXT: lgdr %r0, %f4
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r1)
+; CHECK-NEXT: lgdr %r0, %f2
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r1)
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r1)
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1_store_returned:
; VECTOR: # %bb.0:
-; VECTOR-NEXT: stmg %r12, %r15, 96(%r15)
-; VECTOR-NEXT: .cfi_offset %r12, -64
-; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -160
; VECTOR-NEXT: .cfi_def_cfa_offset 320
-; VECTOR-NEXT: lgr %r13, %r3
-; VECTOR-NEXT: lgr %r12, %r2
-; VECTOR-NEXT: brasl %r14, foo3@PLT
-; VECTOR-NEXT: vst %v24, 0(%r13), 3
-; VECTOR-NEXT: vl %v24, 0(%r12), 3
-; VECTOR-NEXT: lmg %r12, %r15, 256(%r15)
+; VECTOR-NEXT: brasl %r14, Fnptr@PLT
+; VECTOR-NEXT: lgrl %r1, Dst@GOT
+; VECTOR-NEXT: vreph %v0, %v24, 1
+; VECTOR-NEXT: vreph %v1, %v24, 2
+; VECTOR-NEXT: vreph %v2, %v24, 3
+; VECTOR-NEXT: vsteh %v24, 0(%r1), 0
+; VECTOR-NEXT: vsteh %v2, 6(%r1), 0
+; VECTOR-NEXT: vsteh %v1, 4(%r1), 0
+; VECTOR-NEXT: vsteh %v0, 2(%r1), 0
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
+ %C = call %Ty1 @Fnptr()
+ store %Ty1 %C, ptr @Dst
+ ret void
+}
+
+%Ty2 = type <16 x half>
+define void @fun2_arg(%Ty2 %A) {
+; CHECK-LABEL: fun2_arg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: aghi %r15, -64
+; CHECK-NEXT: .cfi_def_cfa_offset 224
+; CHECK-NEXT: std %f8, 56(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f9, 48(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f10, 40(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f11, 32(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f12, 24(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f13, 16(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f14, 8(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f15, 0(%r15) # 8-byte Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: .cfi_offset %f10, -184
+; CHECK-NEXT: .cfi_offset %f11, -192
+; CHECK-NEXT: .cfi_offset %f12, -200
+; CHECK-NEXT: .cfi_offset %f13, -208
+; CHECK-NEXT: .cfi_offset %f14, -216
+; CHECK-NEXT: .cfi_offset %f15, -224
+; CHECK-NEXT: lgh %r0, 230(%r15)
+; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d
+; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d
+; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgh %r1, 238(%r15)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f1, %r0
+; CHECK-NEXT: lgh %r0, 246(%r15)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f3, %r1
+; CHECK-NEXT: lgh %r1, 254(%r15)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f5, %r0
+; CHECK-NEXT: lgh %r0, 262(%r15)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f7, %r1
+; CHECK-NEXT: lgh %r1, 270(%r15)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f8, %r0
+; CHECK-NEXT: lgh %r0, 278(%r15)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f9, %r1
+; CHECK-NEXT: lgh %r1, 286(%r15)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f10, %r0
+; CHECK-NEXT: lgh %r0, 294(%r15)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f11, %r1
+; CHECK-NEXT: lgh %r1, 302(%r15)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f12, %r0
+; CHECK-NEXT: lgh %r0, 310(%r15)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: lgh %r2, 318(%r15)
+; CHECK-NEXT: ldgr %f13, %r1
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f14, %r0
+; CHECK-NEXT: sllg %r0, %r2, 48
+; CHECK-NEXT: lgrl %r1, Dst@GOT
+; CHECK-NEXT: ldgr %f15, %r0
+; CHECK-NEXT: lgdr %r0, %f6
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r1)
+; CHECK-NEXT: lgdr %r0, %f4
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r1)
+; CHECK-NEXT: lgdr %r0, %f2
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r1)
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r1)
+; CHECK-NEXT: lgdr %r0, %f15
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 30(%r1)
+; CHECK-NEXT: lgdr %r0, %f14
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 28(%r1)
+; CHECK-NEXT: lgdr %r0, %f13
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 26(%r1)
+; CHECK-NEXT: lgdr %r0, %f12
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 24(%r1)
+; CHECK-NEXT: lgdr %r0, %f11
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 22(%r1)
+; CHECK-NEXT: lgdr %r0, %f10
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 20(%r1)
+; CHECK-NEXT: lgdr %r0, %f9
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 18(%r1)
+; CHECK-NEXT: lgdr %r0, %f8
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 16(%r1)
+; CHECK-NEXT: lgdr %r0, %f7
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 14(%r1)
+; CHECK-NEXT: lgdr %r0, %f5
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 12(%r1)
+; CHECK-NEXT: lgdr %r0, %f3
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 10(%r1)
+; CHECK-NEXT: lgdr %r0, %f1
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 8(%r1)
+; CHECK-NEXT: ld %f8, 56(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f9, 48(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f10, 40(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f11, 32(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f12, 24(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f13, 16(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f14, 8(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f15, 0(%r15) # 8-byte Reload
+; CHECK-NEXT: aghi %r15, 64
+; CHECK-NEXT: br %r14
;
-; SCALAR-LABEL: fun3:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r11, %r15, 88(%r15)
-; SCALAR-NEXT: .cfi_offset %r11, -72
-; SCALAR-NEXT: .cfi_offset %r12, -64
-; SCALAR-NEXT: .cfi_offset %r13, -56
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -176
-; SCALAR-NEXT: .cfi_def_cfa_offset 336
-; SCALAR-NEXT: lgr %r13, %r2
-; SCALAR-NEXT: la %r2, 160(%r15)
-; SCALAR-NEXT: lgr %r11, %r4
-; SCALAR-NEXT: lgr %r12, %r3
-; SCALAR-NEXT: brasl %r14, foo3@PLT
-; SCALAR-NEXT: lgh %r0, 160(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: lgh %r0, 162(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f1, %r0
-; SCALAR-NEXT: lgh %r0, 164(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f2, %r0
-; SCALAR-NEXT: lgh %r0, 166(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f3, %r0
-; SCALAR-NEXT: lgh %r0, 168(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f4, %r0
-; SCALAR-NEXT: lgh %r0, 170(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f5, %r0
-; SCALAR-NEXT: lgh %r0, 172(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f6, %r0
-; SCALAR-NEXT: lgh %r0, 174(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f7, %r0
-; SCALAR-NEXT: lgdr %r0, %f7
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 14(%r11)
-; SCALAR-NEXT: lgdr %r0, %f6
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 12(%r11)
-; SCALAR-NEXT: lgdr %r0, %f5
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 10(%r11)
-; SCALAR-NEXT: lgdr %r0, %f4
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 8(%r11)
-; SCALAR-NEXT: lgdr %r0, %f3
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r11)
-; SCALAR-NEXT: lgdr %r0, %f2
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 4(%r11)
-; SCALAR-NEXT: lgdr %r0, %f1
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 2(%r11)
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 0(%r11)
-; SCALAR-NEXT: lg %r0, 0(%r12)
-; SCALAR-NEXT: lg %r1, 8(%r12)
-; SCALAR-NEXT: stg %r1, 8(%r13)
-; SCALAR-NEXT: stg %r0, 0(%r13)
-; SCALAR-NEXT: lmg %r11, %r15, 264(%r15)
-; SCALAR-NEXT: br %r14
- %V = call <8 x half> @foo3()
- store <8 x half> %V, ptr %Dst
- %L = load <8 x half>, ptr %Src
- ret <8 x half> %L
+; VECTOR-LABEL: fun2_arg:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: lgrl %r1, Dst@GOT
+; VECTOR-NEXT: vst %v26, 16(%r1), 4
+; VECTOR-NEXT: vst %v24, 0(%r1), 4
+; VECTOR-NEXT: br %r14
+ store %Ty2 %A, ptr @Dst
+ ret void
}
-declare <4 x half> @foo4()
-define <4 x half> @fun4(ptr %Src, ptr %Dst) {
-; VECTOR-LABEL: fun4:
+define void @fun2_call() {
+; CHECK-LABEL: fun2_call:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -320
+; CHECK-NEXT: .cfi_def_cfa_offset 480
+; CHECK-NEXT: std %f8, 312(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f9, 304(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f10, 296(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f11, 288(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f12, 280(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f13, 272(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f14, 264(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f15, 256(%r15) # 8-byte Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: .cfi_offset %f10, -184
+; CHECK-NEXT: .cfi_offset %f11, -192
+; CHECK-NEXT: .cfi_offset %f12, -200
+; CHECK-NEXT: .cfi_offset %f13, -208
+; CHECK-NEXT: .cfi_offset %f14, -216
+; CHECK-NEXT: .cfi_offset %f15, -224
+; CHECK-NEXT: lgrl %r1, Src@GOT
+; CHECK-NEXT: lgh %r0, 0(%r1)
+; CHECK-NEXT: lgh %r2, 2(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: lgh %r0, 4(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: ldgr %f2, %r2
+; CHECK-NEXT: lgh %r2, 6(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f4, %r0
+; CHECK-NEXT: lgh %r0, 8(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: ldgr %f6, %r2
+; CHECK-NEXT: lgh %r2, 10(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f1, %r0
+; CHECK-NEXT: lgh %r0, 12(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: ldgr %f3, %r2
+; CHECK-NEXT: lgh %r2, 14(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f5, %r0
+; CHECK-NEXT: lgh %r0, 16(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: ldgr %f7, %r2
+; CHECK-NEXT: lgh %r2, 18(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f8, %r0
+; CHECK-NEXT: lgh %r0, 20(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: ldgr %f9, %r2
+; CHECK-NEXT: lgh %r2, 22(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f10, %r0
+; CHECK-NEXT: lgh %r0, 24(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: ldgr %f11, %r2
+; CHECK-NEXT: lgh %r2, 26(%r1)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f12, %r0
+; CHECK-NEXT: lgh %r0, 28(%r1)
+; CHECK-NEXT: sllg %r2, %r2, 48
+; CHECK-NEXT: lgh %r1, 30(%r1)
+; CHECK-NEXT: ldgr %f13, %r2
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f14, %r0
+; CHECK-NEXT: sllg %r0, %r1, 48
+; CHECK-NEXT: ldgr %f15, %r0
+; CHECK-NEXT: lgdr %r0, %f15
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 254(%r15)
+; CHECK-NEXT: lgdr %r0, %f14
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 246(%r15)
+; CHECK-NEXT: lgdr %r0, %f13
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 238(%r15)
+; CHECK-NEXT: lgdr %r0, %f12
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 230(%r15)
+; CHECK-NEXT: lgdr %r0, %f11
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 222(%r15)
+; CHECK-NEXT: lgdr %r0, %f10
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 214(%r15)
+; CHECK-NEXT: lgdr %r0, %f9
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 206(%r15)
+; CHECK-NEXT: lgdr %r0, %f8
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 198(%r15)
+; CHECK-NEXT: lgdr %r0, %f7
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 190(%r15)
+; CHECK-NEXT: lgdr %r0, %f5
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 182(%r15)
+; CHECK-NEXT: lgdr %r0, %f3
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 174(%r15)
+; CHECK-NEXT: lgdr %r0, %f1
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 166(%r15)
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; CHECK-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; CHECK-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; CHECK-NEXT: brasl %r14, Fnptr@PLT
+; CHECK-NEXT: ld %f8, 312(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f9, 304(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f10, 296(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f11, 288(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f12, 280(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f13, 272(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f14, 264(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f15, 256(%r15) # 8-byte Reload
+; CHECK-NEXT: lmg %r14, %r15, 432(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2_call:
; VECTOR: # %bb.0:
-; VECTOR-NEXT: stmg %r12, %r15, 96(%r15)
-; VECTOR-NEXT: .cfi_offset %r12, -64
-; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -160
; VECTOR-NEXT: .cfi_def_cfa_offset 320
-; VECTOR-NEXT: lgr %r13, %r3
-; VECTOR-NEXT: lgr %r12, %r2
-; VECTOR-NEXT: brasl %r14, foo4@PLT
-; VECTOR-NEXT: vsteg %v24, 0(%r13), 0
-; VECTOR-NEXT: vlrepg %v24, 0(%r12)
-; VECTOR-NEXT: lmg %r12, %r15, 256(%r15)
+; VECTOR-NEXT: lgrl %r1, Src@GOT
+; VECTOR-NEXT: vl %v26, 16(%r1), 4
+; VECTOR-NEXT: vl %v24, 0(%r1), 4
+; VECTOR-NEXT: brasl %r14, Fnptr@PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
+ %L = load %Ty2, ptr @Src
+ call void @Fnptr(%Ty2 %L)
+ ret void
+}
+
+define %Ty2 @fun2_ret() {
+; CHECK-LABEL: fun2_ret:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgrl %r1, Src@GOT
+; CHECK-NEXT: lg %r0, 24(%r1)
+; CHECK-NEXT: lg %r3, 16(%r1)
+; CHECK-NEXT: lg %r4, 8(%r1)
+; CHECK-NEXT: lg %r1, 0(%r1)
+; CHECK-NEXT: stg %r0, 24(%r2)
+; CHECK-NEXT: stg %r3, 16(%r2)
+; CHECK-NEXT: stg %r4, 8(%r2)
+; CHECK-NEXT: stg %r1, 0(%r2)
+; CHECK-NEXT: br %r14
;
-; SCALAR-LABEL: fun4:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r12, %r15, 96(%r15)
-; SCALAR-NEXT: .cfi_offset %r12, -64
-; SCALAR-NEXT: .cfi_offset %r13, -56
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -160
-; SCALAR-NEXT: .cfi_def_cfa_offset 320
-; SCALAR-NEXT: lgr %r12, %r3
-; SCALAR-NEXT: lgr %r13, %r2
-; SCALAR-NEXT: brasl %r14, foo4@PLT
-; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: # kill: def $f2h killed $f2h def $f2d
-; SCALAR-NEXT: sth %r0, 0(%r12)
-; SCALAR-NEXT: lgdr %r0, %f2
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: # kill: def $f4h killed $f4h def $f4d
-; SCALAR-NEXT: sth %r0, 2(%r12)
-; SCALAR-NEXT: lgdr %r0, %f4
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: # kill: def $f6h killed $f6h def $f6d
-; SCALAR-NEXT: sth %r0, 4(%r12)
-; SCALAR-NEXT: lgdr %r0, %f6
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r12)
-; SCALAR-NEXT: lgh %r0, 0(%r13)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: lgh %r0, 2(%r13)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f2, %r0
-; SCALAR-NEXT: # kill: def $f2h killed $f2h killed $f2d
-; SCALAR-NEXT: lgh %r0, 4(%r13)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f4, %r0
-; SCALAR-NEXT: # kill: def $f4h killed $f4h killed $f4d
-; SCALAR-NEXT: lgh %r0, 6(%r13)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f6, %r0
-; SCALAR-NEXT: # kill: def $f6h killed $f6h killed $f6d
-; SCALAR-NEXT: lmg %r12, %r15, 256(%r15)
-; SCALAR-NEXT: br %r14
- %V = call <4 x half> @foo4()
- store <4 x half> %V, ptr %Dst
- %L = load <4 x half>, ptr %Src
- ret <4 x half> %L
+; VECTOR-LABEL: fun2_ret:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: lgrl %r1, Src@GOT
+; VECTOR-NEXT: vl %v24, 0(%r1), 4
+; VECTOR-NEXT: vl %v26, 16(%r1), 4
+; VECTOR-NEXT: br %r14
+ %L = load %Ty2, ptr @Src
+ ret %Ty2 %L
}
-declare <16 x half> @foo5()
-define <16 x half> @fun5(ptr %Src, ptr %Dst) {
-; VECTOR-LABEL: fun5:
+define void @fun2_store_returned() {
+; CHECK-LABEL: fun2_store_returned:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -192
+; CHECK-NEXT: .cfi_def_cfa_offset 352
+; CHECK-NEXT: la %r2, 160(%r15)
+; CHECK-NEXT: brasl %r14, Fnptr@PLT
+; CHECK-NEXT: lg %r0, 184(%r15)
+; CHECK-NEXT: lgrl %r1, Dst@GOT
+; CHECK-NEXT: lg %r2, 176(%r15)
+; CHECK-NEXT: lg %r3, 168(%r15)
+; CHECK-NEXT: lg %r4, 160(%r15)
+; CHECK-NEXT: stg %r0, 24(%r1)
+; CHECK-NEXT: stg %r2, 16(%r1)
+; CHECK-NEXT: stg %r3, 8(%r1)
+; CHECK-NEXT: stg %r4, 0(%r1)
+; CHECK-NEXT: lmg %r14, %r15, 304(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2_store_returned:
; VECTOR: # %bb.0:
-; VECTOR-NEXT: stmg %r12, %r15, 96(%r15)
-; VECTOR-NEXT: .cfi_offset %r12, -64
-; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -160
; VECTOR-NEXT: .cfi_def_cfa_offset 320
-; VECTOR-NEXT: lgr %r13, %r3
-; VECTOR-NEXT: lgr %r12, %r2
-; VECTOR-NEXT: brasl %r14, foo5@PLT
-; VECTOR-NEXT: vst %v24, 0(%r13), 4
-; VECTOR-NEXT: vst %v26, 16(%r13), 4
-; VECTOR-NEXT: vl %v24, 0(%r12), 4
-; VECTOR-NEXT: vl %v26, 16(%r12), 4
-; VECTOR-NEXT: lmg %r12, %r15, 256(%r15)
+; VECTOR-NEXT: brasl %r14, Fnptr@PLT
+; VECTOR-NEXT: lgrl %r1, Dst@GOT
+; VECTOR-NEXT: vst %v26, 16(%r1), 4
+; VECTOR-NEXT: vst %v24, 0(%r1), 4
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun5:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r11, %r15, 88(%r15)
-; SCALAR-NEXT: .cfi_offset %r11, -72
-; SCALAR-NEXT: .cfi_offset %r12, -64
-; SCALAR-NEXT: .cfi_offset %r13, -56
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -192
-; SCALAR-NEXT: .cfi_def_cfa_offset 352
-; SCALAR-NEXT: lgr %r11, %r2
-; SCALAR-NEXT: la %r2, 160(%r15)
-; SCALAR-NEXT: lgr %r13, %r4
-; SCALAR-NEXT: lgr %r12, %r3
-; SCALAR-NEXT: brasl %r14, foo5@PLT
-; SCALAR-NEXT: lg %r0, 160(%r15)
-; SCALAR-NEXT: lg %r1, 168(%r15)
-; SCALAR-NEXT: lg %r2, 176(%r15)
-; SCALAR-NEXT: lg %r3, 184(%r15)
-; SCALAR-NEXT: stg %r3, 24(%r13)
-; SCALAR-NEXT: stg %r2, 16(%r13)
-; SCALAR-NEXT: stg %r1, 8(%r13)
-; SCALAR-NEXT: stg %r0, 0(%r13)
-; SCALAR-NEXT: lg %r0, 24(%r12)
-; SCALAR-NEXT: lg %r1, 16(%r12)
-; SCALAR-NEXT: lg %r2, 8(%r12)
-; SCALAR-NEXT: lg %r3, 0(%r12)
-; SCALAR-NEXT: stg %r3, 0(%r11)
-; SCALAR-NEXT: stg %r2, 8(%r11)
-; SCALAR-NEXT: stg %r1, 16(%r11)
-; SCALAR-NEXT: stg %r0, 24(%r11)
-; SCALAR-NEXT: lmg %r11, %r15, 280(%r15)
-; SCALAR-NEXT: br %r14
- %V = call <16 x half> @foo5()
- store <16 x half> %V, ptr %Dst
- %L = load <16 x half>, ptr %Src
- ret <16 x half> %L
+ %C = call %Ty2 @Fnptr()
+ store %Ty2 %C, ptr @Dst
+ ret void
}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll
index ad0a5cac5cc08..825472299d028 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll
@@ -1,23 +1,207 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=VECTOR
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=SCALAR
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR
+;
+; Test some fp16 vector operations, which must be scalarized. With fewer than
+; 8 elements, operations should be emitted only for the used elements.
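+; Each scalarized operation extends the half operands to float
+; (__extendhfsf2), operates on them (e.g. aebr), and truncates the result
+; back (__truncsfhf2).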
-; Scalarized operations, full vector.
-define <8 x half> @fun0(<8 x half> %LHS, <8 x half> %RHS) {
+%Ty0 = type <8 x half>
+define void @fun0(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -288
+; CHECK-NEXT: .cfi_def_cfa_offset 448
+; CHECK-NEXT: std %f8, 280(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f9, 272(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f10, 264(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f11, 256(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f12, 248(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f13, 240(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f14, 232(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f15, 224(%r15) # 8-byte Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: .cfi_offset %f10, -184
+; CHECK-NEXT: .cfi_offset %f11, -192
+; CHECK-NEXT: .cfi_offset %f12, -200
+; CHECK-NEXT: .cfi_offset %f13, -208
+; CHECK-NEXT: .cfi_offset %f14, -216
+; CHECK-NEXT: .cfi_offset %f15, -224
+; CHECK-NEXT: lgh %r0, 14(%r2)
+; CHECK-NEXT: lgr %r13, %r3
+; CHECK-NEXT: lgh %r1, 12(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: stg %r0, 216(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r0, 10(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: stg %r1, 208(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r1, 8(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: stg %r0, 200(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r0, 6(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: stg %r1, 192(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r1, 4(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: stg %r0, 176(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r0, 2(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: stg %r1, 160(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r1, 0(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f8, %r0
+; CHECK-NEXT: lgh %r0, 30(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f13, %r1
+; CHECK-NEXT: lgh %r1, 28(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: stg %r0, 184(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r0, 26(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: stg %r1, 168(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r1, 24(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: lgh %r3, 22(%r2)
+; CHECK-NEXT: ldgr %f10, %r0
+; CHECK-NEXT: sllg %r0, %r1, 48
+; CHECK-NEXT: ldgr %f11, %r0
+; CHECK-NEXT: sllg %r0, %r3, 48
+; CHECK-NEXT: lgh %r1, 20(%r2)
+; CHECK-NEXT: ldgr %f12, %r0
+; CHECK-NEXT: lgh %r0, 18(%r2)
+; CHECK-NEXT: lgh %r2, 16(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f14, %r1
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: sllg %r1, %r2, 48
+; CHECK-NEXT: ldgr %f0, %r1
+; CHECK-NEXT: ldgr %f15, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f13
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: aebr %f0, %f9
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f13, %f0
+; CHECK-NEXT: ler %f0, %f15
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: aebr %f0, %f9
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f14
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ld %f0, 160(%r15) # 8-byte Reload
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: aebr %f0, %f9
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f12
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f12, %f0
+; CHECK-NEXT: ld %f0, 176(%r15) # 8-byte Reload
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: aebr %f0, %f12
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f14, %f0
+; CHECK-NEXT: ler %f0, %f11
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f11, %f0
+; CHECK-NEXT: ld %f0, 192(%r15) # 8-byte Reload
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: aebr %f0, %f11
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f11, %f0
+; CHECK-NEXT: ler %f0, %f10
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ld %f0, 200(%r15) # 8-byte Reload
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: aebr %f0, %f10
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ld %f0, 168(%r15) # 8-byte Reload
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f12, %f0
+; CHECK-NEXT: ld %f0, 208(%r15) # 8-byte Reload
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: aebr %f0, %f12
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f12, %f0
+; CHECK-NEXT: ld %f0, 184(%r15) # 8-byte Reload
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f15, %f0
+; CHECK-NEXT: ld %f0, 216(%r15) # 8-byte Reload
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: aebr %f0, %f15
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 14(%r13)
+; CHECK-NEXT: lgdr %r0, %f12
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 12(%r13)
+; CHECK-NEXT: lgdr %r0, %f10
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 10(%r13)
+; CHECK-NEXT: lgdr %r0, %f11
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 8(%r13)
+; CHECK-NEXT: lgdr %r0, %f14
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r13)
+; CHECK-NEXT: lgdr %r0, %f9
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r13)
+; CHECK-NEXT: lgdr %r0, %f8
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r13)
+; CHECK-NEXT: lgdr %r0, %f13
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT: ld %f8, 280(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f9, 272(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f10, 264(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f11, 256(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f12, 248(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f13, 240(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f14, 232(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f15, 224(%r15) # 8-byte Reload
+; CHECK-NEXT: lmg %r13, %r15, 392(%r15)
+; CHECK-NEXT: br %r14
+;
; VECTOR-LABEL: fun0:
; VECTOR: # %bb.0:
-; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -248
; VECTOR-NEXT: .cfi_def_cfa_offset 408
; VECTOR-NEXT: std %f8, 240(%r15) # 8-byte Spill
; VECTOR-NEXT: .cfi_offset %f8, -168
-; VECTOR-NEXT: vst %v26, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vreph %v0, %v26, 7
+; VECTOR-NEXT: vl %v0, 16(%r2), 3
+; VECTOR-NEXT: mvc 160(16,%r15), 0(%r2) # 16-byte Folded Spill
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v0, 7
; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
; VECTOR-NEXT: ldr %f8, %f0
@@ -131,758 +315,205 @@ define <8 x half> @fun0(<8 x half> %LHS, <8 x half> %RHS) {
; VECTOR-NEXT: vmrhf %v0, %v0, %v1
; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
; VECTOR-NEXT: ld %f8, 240(%r15) # 8-byte Reload
-; VECTOR-NEXT: vmrhg %v24, %v0, %v1
-; VECTOR-NEXT: lmg %r14, %r15, 360(%r15)
+; VECTOR-NEXT: vmrhg %v0, %v0, %v1
+; VECTOR-NEXT: vst %v0, 0(%r13), 3
+; VECTOR-NEXT: lmg %r13, %r15, 352(%r15)
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun0:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r13, %r15, 104(%r15)
-; SCALAR-NEXT: .cfi_offset %r13, -56
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -288
-; SCALAR-NEXT: .cfi_def_cfa_offset 448
-; SCALAR-NEXT: std %f8, 280(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f9, 272(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f10, 264(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f11, 256(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f12, 248(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f13, 240(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f14, 232(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f15, 224(%r15) # 8-byte Spill
-; SCALAR-NEXT: .cfi_offset %f8, -168
-; SCALAR-NEXT: .cfi_offset %f9, -176
-; SCALAR-NEXT: .cfi_offset %f10, -184
-; SCALAR-NEXT: .cfi_offset %f11, -192
-; SCALAR-NEXT: .cfi_offset %f12, -200
-; SCALAR-NEXT: .cfi_offset %f13, -208
-; SCALAR-NEXT: .cfi_offset %f14, -216
-; SCALAR-NEXT: .cfi_offset %f15, -224
-; SCALAR-NEXT: lgh %r0, 478(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: stg %r0, 216(%r15) # 8-byte Spill
-; SCALAR-NEXT: lgh %r0, 542(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: stg %r0, 208(%r15) # 8-byte Spill
-; SCALAR-NEXT: lgh %r0, 470(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: stg %r0, 192(%r15) # 8-byte Spill
-; SCALAR-NEXT: lgh %r0, 534(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: stg %r0, 184(%r15) # 8-byte Spill
-; SCALAR-NEXT: lgh %r0, 462(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: stg %r0, 176(%r15) # 8-byte Spill
-; SCALAR-NEXT: lgh %r0, 526(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: stg %r0, 168(%r15) # 8-byte Spill
-; SCALAR-NEXT: lgh %r0, 454(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f10, %r0
-; SCALAR-NEXT: lgh %r0, 518(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f11, %r0
-; SCALAR-NEXT: lgh %r0, 510(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f14, %r0
-; SCALAR-NEXT: lgh %r0, 502(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f15, %r0
-; SCALAR-NEXT: lgh %r0, 494(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f8, %r0
-; SCALAR-NEXT: lgh %r0, 486(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ste %f6, 164(%r15) # 4-byte Spill
-; SCALAR-NEXT: ste %f4, 160(%r15) # 4-byte Spill
-; SCALAR-NEXT: ler %f13, %f2
-; SCALAR-NEXT: ler %f12, %f0
-; SCALAR-NEXT: lgr %r13, %r2
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f9, %f0
-; SCALAR-NEXT: ler %f0, %f12
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: aebr %f0, %f9
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
-; SCALAR-NEXT: std %f0, 200(%r15) # 8-byte Spill
-; SCALAR-NEXT: ler %f0, %f8
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f8, %f0
-; SCALAR-NEXT: ler %f0, %f13
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: aebr %f0, %f8
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f13, %f0
-; SCALAR-NEXT: ler %f0, %f15
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f8, %f0
-; SCALAR-NEXT: le %f0, 160(%r15) # 4-byte Reload
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: aebr %f0, %f8
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f9, %f0
-; SCALAR-NEXT: ler %f0, %f14
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f8, %f0
-; SCALAR-NEXT: le %f0, 164(%r15) # 4-byte Reload
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: aebr %f0, %f8
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f14, %f0
-; SCALAR-NEXT: ler %f0, %f11
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f8, %f0
-; SCALAR-NEXT: ler %f0, %f10
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: aebr %f0, %f8
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f10, %f0
-; SCALAR-NEXT: ld %f0, 168(%r15) # 8-byte Reload
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f8, %f0
-; SCALAR-NEXT: ld %f0, 176(%r15) # 8-byte Reload
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: aebr %f0, %f8
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f8, %f0
-; SCALAR-NEXT: ld %f0, 184(%r15) # 8-byte Reload
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f11, %f0
-; SCALAR-NEXT: ld %f0, 192(%r15) # 8-byte Reload
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: aebr %f0, %f11
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f11, %f0
-; SCALAR-NEXT: ld %f0, 208(%r15) # 8-byte Reload
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f12, %f0
-; SCALAR-NEXT: ld %f0, 216(%r15) # 8-byte Reload
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: aebr %f0, %f12
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 14(%r13)
-; SCALAR-NEXT: lgdr %r0, %f11
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 12(%r13)
-; SCALAR-NEXT: lgdr %r0, %f8
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 10(%r13)
-; SCALAR-NEXT: lgdr %r0, %f10
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 8(%r13)
-; SCALAR-NEXT: lgdr %r0, %f14
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r13)
-; SCALAR-NEXT: lgdr %r0, %f9
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 4(%r13)
-; SCALAR-NEXT: lgdr %r0, %f13
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 2(%r13)
-; SCALAR-NEXT: lg %r0, 200(%r15) # 8-byte Reload
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 0(%r13)
-; SCALAR-NEXT: ld %f8, 280(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f9, 272(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f10, 264(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f11, 256(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f12, 248(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f13, 240(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f14, 232(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f15, 224(%r15) # 8-byte Reload
-; SCALAR-NEXT: lmg %r13, %r15, 392(%r15)
-; SCALAR-NEXT: br %r14
- %Res = fadd <8 x half> %LHS, %RHS
- ret <8 x half> %Res
+ %LHS = load %Ty0, ptr %Src
+ %S2 = getelementptr %Ty0, ptr %Src, i32 1
+ %RHS = load %Ty0, ptr %S2
+ %Res = fadd %Ty0 %LHS, %RHS
+ store %Ty0 %Res, ptr %Dst
+ ret void
}
-; Scalarized operations, partial vector. TODO: The v4f16 is first widened and
-; then scalarized, which unfortunately results in 8 scalar operations. Maybe
-; the DAGCombiner could be helped to handle EXTRACT_SUBVECTOR in cases like
-; this, where the operands start out as full vectors.
-define <4 x half> @fun1(<4 x half> %LHS, <4 x half> %RHS) {
-; VECTOR-LABEL: fun1:
-; VECTOR: # %bb.0:
-; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
-; VECTOR-NEXT: .cfi_offset %r14, -48
-; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -248
-; VECTOR-NEXT: .cfi_def_cfa_offset 408
-; VECTOR-NEXT: std %f8, 240(%r15) # 8-byte Spill
-; VECTOR-NEXT: .cfi_offset %f8, -168
-; VECTOR-NEXT: vst %v26, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vreph %v0, %v26, 7
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 7
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 6
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 6
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 5
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 5
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 4
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 4
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhf %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 224(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: vl %v1, 224(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v1, %v0
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhf %v0, %v0, %v1
-; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: ld %f8, 240(%r15) # 8-byte Reload
-; VECTOR-NEXT: vmrhg %v24, %v0, %v1
-; VECTOR-NEXT: lmg %r14, %r15, 360(%r15)
-; VECTOR-NEXT: br %r14
+%Ty1 = type <4 x half>
+define void @fun1(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -224
+; CHECK-NEXT: .cfi_def_cfa_offset 384
+; CHECK-NEXT: std %f8, 216(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f9, 208(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f10, 200(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f11, 192(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f12, 184(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f13, 176(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f14, 168(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f15, 160(%r15) # 8-byte Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: .cfi_offset %f10, -184
+; CHECK-NEXT: .cfi_offset %f11, -192
+; CHECK-NEXT: .cfi_offset %f12, -200
+; CHECK-NEXT: .cfi_offset %f13, -208
+; CHECK-NEXT: .cfi_offset %f14, -216
+; CHECK-NEXT: .cfi_offset %f15, -224
+; CHECK-NEXT: lgh %r0, 6(%r2)
+; CHECK-NEXT: lgr %r13, %r3
+; CHECK-NEXT: lgh %r1, 4(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f8, %r0
+; CHECK-NEXT: lgh %r0, 2(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f9, %r1
+; CHECK-NEXT: lgh %r1, 0(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: lgh %r3, 14(%r2)
+; CHECK-NEXT: ldgr %f12, %r0
+; CHECK-NEXT: sllg %r0, %r1, 48
+; CHECK-NEXT: ldgr %f10, %r0
+; CHECK-NEXT: sllg %r0, %r3, 48
+; CHECK-NEXT: lgh %r1, 12(%r2)
+; CHECK-NEXT: ldgr %f11, %r0
+; CHECK-NEXT: lgh %r0, 10(%r2)
+; CHECK-NEXT: lgh %r2, 8(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f13, %r1
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: sllg %r1, %r2, 48
+; CHECK-NEXT: ldgr %f0, %r1
+; CHECK-NEXT: ldgr %f14, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f15, %f0
+; CHECK-NEXT: ler %f0, %f10
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: sebr %f0, %f15
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ler %f0, %f14
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f14, %f0
+; CHECK-NEXT: ler %f0, %f12
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: sebr %f0, %f14
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f12, %f0
+; CHECK-NEXT: ler %f0, %f13
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f13, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: sebr %f0, %f13
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f11
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f11, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: sebr %f0, %f11
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r13)
+; CHECK-NEXT: lgdr %r0, %f9
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r13)
+; CHECK-NEXT: lgdr %r0, %f12
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r13)
+; CHECK-NEXT: lgdr %r0, %f10
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT: ld %f8, 216(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f9, 208(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f10, 200(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f11, 192(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f12, 184(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f13, 176(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f14, 168(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f15, 160(%r15) # 8-byte Reload
+; CHECK-NEXT: lmg %r13, %r15, 328(%r15)
+; CHECK-NEXT: br %r14
;
-; SCALAR-LABEL: fun1:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -224
-; SCALAR-NEXT: .cfi_def_cfa_offset 384
-; SCALAR-NEXT: std %f8, 216(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f9, 208(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f10, 200(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f11, 192(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f12, 184(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f13, 176(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f14, 168(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f15, 160(%r15) # 8-byte Spill
-; SCALAR-NEXT: .cfi_offset %f8, -168
-; SCALAR-NEXT: .cfi_offset %f9, -176
-; SCALAR-NEXT: .cfi_offset %f10, -184
-; SCALAR-NEXT: .cfi_offset %f11, -192
-; SCALAR-NEXT: .cfi_offset %f12, -200
-; SCALAR-NEXT: .cfi_offset %f13, -208
-; SCALAR-NEXT: .cfi_offset %f14, -216
-; SCALAR-NEXT: .cfi_offset %f15, -224
-; SCALAR-NEXT: lgh %r0, 414(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f9, %r0
-; SCALAR-NEXT: lgh %r0, 406(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f13, %r0
-; SCALAR-NEXT: lgh %r0, 398(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f14, %r0
-; SCALAR-NEXT: lgh %r0, 390(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ler %f8, %f6
-; SCALAR-NEXT: ler %f10, %f4
-; SCALAR-NEXT: ler %f12, %f2
-; SCALAR-NEXT: ler %f11, %f0
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f15, %f0
-; SCALAR-NEXT: ler %f0, %f11
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f15
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f11, %f0
-; SCALAR-NEXT: ler %f0, %f14
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f14, %f0
-; SCALAR-NEXT: ler %f0, %f12
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f14
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f12, %f0
-; SCALAR-NEXT: ler %f0, %f13
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f13, %f0
-; SCALAR-NEXT: ler %f0, %f10
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f13
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f10, %f0
-; SCALAR-NEXT: ler %f0, %f9
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f9, %f0
-; SCALAR-NEXT: ler %f0, %f8
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f9
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f6, %f0
-; SCALAR-NEXT: ler %f0, %f11
-; SCALAR-NEXT: ler %f2, %f12
-; SCALAR-NEXT: ler %f4, %f10
-; SCALAR-NEXT: ld %f8, 216(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f9, 208(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f10, 200(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f11, 192(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f12, 184(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f13, 176(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f14, 168(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f15, 160(%r15) # 8-byte Reload
-; SCALAR-NEXT: lmg %r14, %r15, 336(%r15)
-; SCALAR-NEXT: br %r14
- %Res = fsub <4 x half> %LHS, %RHS
- ret <4 x half> %Res
-}
-
-; Same, but the resulting v4f16 is stored instead, and
-; SimplifyDemandedVectorElts() can remove the unneeded scalar operations.
-; (SCALAR_TO_VECTOR handling in combineExtract)
-define void @fun2(<4 x half> %LHS, <4 x half> %RHS, ptr %Dst) {
-; VECTOR-LABEL: fun2:
+; VECTOR-LABEL: fun1:
; VECTOR: # %bb.0:
; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
; VECTOR-NEXT: .cfi_offset %r13, -56
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -232
-; VECTOR-NEXT: .cfi_def_cfa_offset 392
-; VECTOR-NEXT: std %f8, 224(%r15) # 8-byte Spill
+; VECTOR-NEXT: aghi %r15, -224
+; VECTOR-NEXT: .cfi_def_cfa_offset 384
+; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Spill
; VECTOR-NEXT: .cfi_offset %f8, -168
-; VECTOR-NEXT: lgr %r13, %r2
-; VECTOR-NEXT: vst %v26, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vreph %v0, %v26, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: .cfi_offset %f12, -200
+; VECTOR-NEXT: .cfi_offset %f13, -208
+; VECTOR-NEXT: .cfi_offset %f14, -216
+; VECTOR-NEXT: .cfi_offset %f15, -224
+; VECTOR-NEXT: vlreph %v0, 8(%r2)
+; VECTOR-NEXT: vlreph %v8, 6(%r2)
+; VECTOR-NEXT: vlreph %v9, 4(%r2)
+; VECTOR-NEXT: vlreph %v10, 2(%r2)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlreph %v11, 0(%r2)
+; VECTOR-NEXT: vlreph %v12, 14(%r2)
+; VECTOR-NEXT: vlreph %v13, 12(%r2)
+; VECTOR-NEXT: vlreph %v14, 10(%r2)
+; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
+; VECTOR-NEXT: ldr %f15, %f0
+; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
+; VECTOR-NEXT: sebr %f0, %f15
; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: ldr %f11, %f0
+; VECTOR-NEXT: ldr %f0, %f14
; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: ldr %f14, %f0
+; VECTOR-NEXT: ldr %f0, %f10
; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: sebr %f0, %f14
; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f13
; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: ldr %f13, %f0
+; VECTOR-NEXT: ldr %f0, %f9
; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: sebr %f0, %f13
; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f12
; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: ldr %f12, %f0
+; VECTOR-NEXT: ldr %f0, %f8
; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
+; VECTOR-NEXT: sebr %f0, %f12
; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v1, %v0
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhf %v0, %v0, %v1
-; VECTOR-NEXT: vmrhf %v1, %v0, %v0
-; VECTOR-NEXT: ld %f8, 224(%r15) # 8-byte Reload
-; VECTOR-NEXT: vmrhg %v0, %v0, %v1
-; VECTOR-NEXT: vsteg %v0, 0(%r13), 0
-; VECTOR-NEXT: lmg %r13, %r15, 336(%r15)
+; VECTOR-NEXT: vsteh %v9, 4(%r13), 0
+; VECTOR-NEXT: vsteh %v10, 2(%r13), 0
+; VECTOR-NEXT: vsteh %v11, 0(%r13), 0
+; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Reload
+; VECTOR-NEXT: vsteh %v0, 6(%r13), 0
+; VECTOR-NEXT: lmg %r13, %r15, 328(%r15)
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun2:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r13, %r15, 104(%r15)
-; SCALAR-NEXT: .cfi_offset %r13, -56
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -224
-; SCALAR-NEXT: .cfi_def_cfa_offset 384
-; SCALAR-NEXT: std %f8, 216(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f9, 208(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f10, 200(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f11, 192(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f12, 184(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f13, 176(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f14, 168(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f15, 160(%r15) # 8-byte Spill
-; SCALAR-NEXT: .cfi_offset %f8, -168
-; SCALAR-NEXT: .cfi_offset %f9, -176
-; SCALAR-NEXT: .cfi_offset %f10, -184
-; SCALAR-NEXT: .cfi_offset %f11, -192
-; SCALAR-NEXT: .cfi_offset %f12, -200
-; SCALAR-NEXT: .cfi_offset %f13, -208
-; SCALAR-NEXT: .cfi_offset %f14, -216
-; SCALAR-NEXT: .cfi_offset %f15, -224
-; SCALAR-NEXT: lgh %r0, 414(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f10, %r0
-; SCALAR-NEXT: lgh %r0, 406(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f13, %r0
-; SCALAR-NEXT: lgh %r0, 398(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f14, %r0
-; SCALAR-NEXT: lgh %r0, 390(%r15)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: lgr %r13, %r2
-; SCALAR-NEXT: ler %f8, %f6
-; SCALAR-NEXT: ler %f11, %f4
-; SCALAR-NEXT: ler %f12, %f2
-; SCALAR-NEXT: ler %f9, %f0
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f15, %f0
-; SCALAR-NEXT: ler %f0, %f9
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f15
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f9, %f0
-; SCALAR-NEXT: ler %f0, %f14
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f14, %f0
-; SCALAR-NEXT: ler %f0, %f12
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f14
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f12, %f0
-; SCALAR-NEXT: ler %f0, %f13
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f13, %f0
-; SCALAR-NEXT: ler %f0, %f11
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f13
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f11, %f0
-; SCALAR-NEXT: ler %f0, %f10
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f10, %f0
-; SCALAR-NEXT: ler %f0, %f8
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f10
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: # kill: def $f0h killed $f0h def $f0d
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r13)
-; SCALAR-NEXT: lgdr %r0, %f11
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 4(%r13)
-; SCALAR-NEXT: lgdr %r0, %f12
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 2(%r13)
-; SCALAR-NEXT: lgdr %r0, %f9
-; SCALAR-NEXT: ld %f8, 216(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f9, 208(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f10, 200(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f11, 192(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f12, 184(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f13, 176(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f14, 168(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f15, 160(%r15) # 8-byte Reload
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 0(%r13)
-; SCALAR-NEXT: lmg %r13, %r15, 328(%r15)
-; SCALAR-NEXT: br %r14
- %Res = fsub <4 x half> %LHS, %RHS
- store <4 x half> %Res, ptr %Dst
+ %LHS = load %Ty1, ptr %Src
+ %S2 = getelementptr %Ty1, ptr %Src, i32 1
+ %RHS = load %Ty1, ptr %S2
+ %Res = fsub %Ty1 %LHS, %RHS
+ store %Ty1 %Res, ptr %Dst
ret void
}
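The fun1 output above also shows the element-access difference between the two paths: the CHECK (scalar) output round-trips every f16 element through a GPR, while the VECTOR (z16) output loads and stores halfword elements directly. Condensed from the sequences above, as a side-by-side sketch rather than extra output from the patch:

    # scalar path: f16 element load via a GPR
    lgh  %r0, 0(%r2)        # load the halfword into a GPR
    sllg %r0, %r0, 48       # shift it up to the FPR bit position
    ldgr %f0, %r0           # transfer GPR -> FPR
    # z16 path: direct halfword element access
    vlreph %v0, 0(%r2)      # load-and-replicate halfword element
    vsteh  %v0, 0(%r13), 0  # store element 0 as a halfword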
-
-; The handling in combineExtract() works, but due to the order in which the
-; DAGCombiner revisits nodes and users, the fsubs are replaced with NaNs
-; instead of Undefs (see the comment in foldConstantFPMath()). Thus the
-; vrepih below.
-define <4 x half> @fun3(ptr %Src, ptr %Dst) {
-; VECTOR-LABEL: fun3:
-; VECTOR: # %bb.0:
-; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
-; VECTOR-NEXT: .cfi_offset %r14, -48
-; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -232
-; VECTOR-NEXT: .cfi_def_cfa_offset 392
-; VECTOR-NEXT: std %f8, 224(%r15) # 8-byte Spill
-; VECTOR-NEXT: .cfi_offset %f8, -168
-; VECTOR-NEXT: vlrepg %v0, 0(%r2)
-; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vlrepg %v0, 8(%r2)
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vreph %v0, %v0, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: ldr %f8, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
-; VECTOR-NEXT: sebr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT
-; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v1, %v0
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhf %v0, %v0, %v1
-; VECTOR-NEXT: vrepih %v1, 32256
-; VECTOR-NEXT: vmrhh %v1, %v1, %v1
-; VECTOR-NEXT: ld %f8, 224(%r15) # 8-byte Reload
-; VECTOR-NEXT: vmrhf %v1, %v1, %v1
-; VECTOR-NEXT: vmrhg %v24, %v0, %v1
-; VECTOR-NEXT: lmg %r14, %r15, 344(%r15)
-; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun3:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r14, %r15, 112(%r15)
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -224
-; SCALAR-NEXT: .cfi_def_cfa_offset 384
-; SCALAR-NEXT: std %f8, 216(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f9, 208(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f10, 200(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f11, 192(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f12, 184(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f13, 176(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f14, 168(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f15, 160(%r15) # 8-byte Spill
-; SCALAR-NEXT: .cfi_offset %f8, -168
-; SCALAR-NEXT: .cfi_offset %f9, -176
-; SCALAR-NEXT: .cfi_offset %f10, -184
-; SCALAR-NEXT: .cfi_offset %f11, -192
-; SCALAR-NEXT: .cfi_offset %f12, -200
-; SCALAR-NEXT: .cfi_offset %f13, -208
-; SCALAR-NEXT: .cfi_offset %f14, -216
-; SCALAR-NEXT: .cfi_offset %f15, -224
-; SCALAR-NEXT: lgh %r0, 6(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f8, %r0
-; SCALAR-NEXT: lgh %r0, 4(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f9, %r0
-; SCALAR-NEXT: lgh %r0, 2(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f11, %r0
-; SCALAR-NEXT: lgh %r0, 0(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f12, %r0
-; SCALAR-NEXT: lgh %r0, 14(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f10, %r0
-; SCALAR-NEXT: lgh %r0, 12(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f13, %r0
-; SCALAR-NEXT: lgh %r0, 10(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f14, %r0
-; SCALAR-NEXT: lgh %r0, 8(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f15, %f0
-; SCALAR-NEXT: ler %f0, %f12
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f15
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f12, %f0
-; SCALAR-NEXT: ler %f0, %f14
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f14, %f0
-; SCALAR-NEXT: ler %f0, %f11
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f14
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f11, %f0
-; SCALAR-NEXT: ler %f0, %f13
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f13, %f0
-; SCALAR-NEXT: ler %f0, %f9
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f13
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f9, %f0
-; SCALAR-NEXT: ler %f0, %f10
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: ler %f10, %f0
-; SCALAR-NEXT: ler %f0, %f8
-; SCALAR-NEXT: brasl %r14, __extendhfsf2@PLT
-; SCALAR-NEXT: sebr %f0, %f10
-; SCALAR-NEXT: brasl %r14, __truncsfhf2@PLT
-; SCALAR-NEXT: ler %f6, %f0
-; SCALAR-NEXT: ler %f0, %f12
-; SCALAR-NEXT: ler %f2, %f11
-; SCALAR-NEXT: ler %f4, %f9
-; SCALAR-NEXT: ld %f8, 216(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f9, 208(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f10, 200(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f11, 192(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f12, 184(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f13, 176(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f14, 168(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f15, 160(%r15) # 8-byte Reload
-; SCALAR-NEXT: lmg %r14, %r15, 336(%r15)
-; SCALAR-NEXT: br %r14
- %L0 = load <4 x half>, ptr %Src
- %Ptr1 = getelementptr <4 x half>, ptr %Src, i64 1
- %L1 = load <4 x half>, ptr %Ptr1
- %Res = fsub <4 x half> %L0, %L1
- ret <4 x half> %Res
-}
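The 32256 immediate that fun3 checked for ties in with the comment above: 32256 is 0x7e00, the IEEE fp16 quiet-NaN bit pattern, so "vrepih %v1, 32256" splats NaN into the lanes whose fsubs were constant-folded. At the IR level the fold behaves roughly as follows (an illustrative sketch, not lines from the patch):

    %a = fsub half undef, undef  ; both operands undef: folds to undef
    %b = fsub half %x, undef     ; one operand undef: folds to NaN (0xH7E00)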
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll
new file mode 100644
index 0000000000000..2f1872fe1ac84
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR
+;
+; Test conversions between different-sized float elements.
+
+; Test cases where both elements of a v2f64 are converted to f16s.
+define void @f1(<2 x double> %val, ptr %ptr) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: lgr %r13, %r2
+; CHECK-NEXT: ldr %f8, %f2
+; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ldr %f0, %f8
+; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r13)
+; CHECK-NEXT: lgdr %r0, %f9
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload
+; CHECK-NEXT: lmg %r13, %r15, 280(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: lgr %r13, %r2
+; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vrepg %v0, %v24, 1
+; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT
+; VECTOR-NEXT: vsteh %v8, 2(%r13), 0
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload
+; VECTOR-NEXT: vsteh %v0, 0(%r13), 0
+; VECTOR-NEXT: lmg %r13, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+ %res = fptrunc <2 x double> %val to <2 x half>
+ store <2 x half> %res, ptr %ptr
+ ret void
+}
+
+; Test conversion of an f64 in a vector register to an f16.
+define half @f2(<2 x double> %vec) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f2:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vlr %v0, %v24
+; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
+; VECTOR-NEXT: br %r14
+ %scalar = extractelement <2 x double> %vec, i32 0
+ %ret = fptrunc double %scalar to half
+ ret half %ret
+}
+
+; Test cases where even elements of a v4f16 are converted to f64s.
+define <2 x double> @f3(<4 x half> %vec) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f8, %f4
+; CHECK-NEXT: brasl %r14, __extendhfdf2@PLT
+; CHECK-NEXT: ldr %f9, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfdf2@PLT
+; CHECK-NEXT: ldr %f2, %f0
+; CHECK-NEXT: ldr %f0, %f9
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f3:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
+; VECTOR-NEXT: vreph %v1, %v24, 2
+; VECTOR-NEXT: vlr %v0, %v24
+; VECTOR-NEXT: vst %v1, 176(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT
+; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT
+; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
+; VECTOR-NEXT: vmrhg %v24, %v1, %v0
+; VECTOR-NEXT: lmg %r14, %r15, 304(%r15)
+; VECTOR-NEXT: br %r14
+ %shuffle = shufflevector <4 x half> %vec, <4 x half> undef, <2 x i32> <i32 0, i32 2>
+ %res = fpext <2 x half> %shuffle to <2 x double>
+ ret <2 x double> %res
+}
+
+; Test conversion of an f16 in a vector register to an f32.
+define float @f4(<4 x half> %vec) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f4:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vlr %v0, %v24
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
+; VECTOR-NEXT: br %r14
+ %scalar = extractelement <4 x half> %vec, i32 0
+ %ret = fpext half %scalar to float
+ ret float %ret
+}
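All of these conversions end up in runtime libcalls, as there is no direct f16 conversion instruction to use here; the vector cases are simply scalarized around the calls. Each lane of f1, for example, is lowered much as if it had been written as scalar IR (an illustrative sketch, not lines from the patch):

    %e0 = extractelement <2 x double> %val, i32 0
    %h0 = fptrunc double %e0 to half  ; becomes the call to __truncdfhf2
    store half %h0, ptr %ptr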
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-conversions.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-conversions.ll
deleted file mode 100644
index 9f926c0e640b6..0000000000000
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector-conversions.ll
+++ /dev/null
@@ -1,2 +0,0 @@
-; TODO:
-; bitconvert, SCALAR_TO_VECTOR, merge-high, merge-low
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll
new file mode 100644
index 0000000000000..0500f43b7f33e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll
@@ -0,0 +1,503 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR
+;
+; Test fcmp and select with fp16 vectors.
+
+; Use of vsel with a full vector.
+%Ty0 = type <8 x half>
+define void @fun0(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r6, %r15, 48(%r15)
+; CHECK-NEXT: .cfi_offset %r6, -112
+; CHECK-NEXT: .cfi_offset %r7, -104
+; CHECK-NEXT: .cfi_offset %r8, -96
+; CHECK-NEXT: .cfi_offset %r9, -88
+; CHECK-NEXT: .cfi_offset %r10, -80
+; CHECK-NEXT: .cfi_offset %r11, -72
+; CHECK-NEXT: .cfi_offset %r12, -64
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -272
+; CHECK-NEXT: .cfi_def_cfa_offset 432
+; CHECK-NEXT: std %f8, 264(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f9, 256(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f10, 248(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f11, 240(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f12, 232(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f13, 224(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f14, 216(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f15, 208(%r15) # 8-byte Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: .cfi_offset %f10, -184
+; CHECK-NEXT: .cfi_offset %f11, -192
+; CHECK-NEXT: .cfi_offset %f12, -200
+; CHECK-NEXT: .cfi_offset %f13, -208
+; CHECK-NEXT: .cfi_offset %f14, -216
+; CHECK-NEXT: .cfi_offset %f15, -224
+; CHECK-NEXT: lgh %r0, 14(%r2)
+; CHECK-NEXT: stg %r0, 200(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r0, 12(%r2)
+; CHECK-NEXT: stg %r0, 160(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r0, 6(%r2)
+; CHECK-NEXT: sllg %r12, %r0, 48
+; CHECK-NEXT: lgh %r0, 4(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f10, %r0
+; CHECK-NEXT: lgh %r0, 2(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f9, %r0
+; CHECK-NEXT: lgh %r0, 0(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f12, %r0
+; CHECK-NEXT: lgh %r0, 30(%r2)
+; CHECK-NEXT: stg %r0, 192(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r0, 28(%r2)
+; CHECK-NEXT: stg %r0, 184(%r15) # 8-byte Spill
+; CHECK-NEXT: lgh %r0, 22(%r2)
+; CHECK-NEXT: sllg %r10, %r0, 48
+; CHECK-NEXT: lgh %r0, 20(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f13, %r0
+; CHECK-NEXT: lgh %r0, 18(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f8, %r0
+; CHECK-NEXT: lgh %r0, 16(%r2)
+; CHECK-NEXT: lgh %r8, 10(%r2)
+; CHECK-NEXT: lgh %r6, 8(%r2)
+; CHECK-NEXT: lgh %r7, 26(%r2)
+; CHECK-NEXT: lgh %r11, 24(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: lgr %r13, %r3
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f11, %f0
+; CHECK-NEXT: ler %f0, %f12
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: cebr %f0, %f11
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ler %f0, %f11
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: sllg %r6, %r6, 48
+; CHECK-NEXT: sllg %r9, %r11, 48
+; CHECK-NEXT: ldgr %f11, %r12
+; CHECK-NEXT: ldgr %f15, %r10
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: std %f0, 176(%r15) # 8-byte Spill
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: cebr %f0, %f8
+; CHECK-NEXT: je .LBB0_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: sllg %r11, %r8, 48
+; CHECK-NEXT: sllg %r8, %r7, 48
+; CHECK-NEXT: ldgr %f12, %r6
+; CHECK-NEXT: ldgr %f14, %r9
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: std %f0, 168(%r15) # 8-byte Spill
+; CHECK-NEXT: ler %f0, %f13
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f10
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: cebr %f0, %f8
+; CHECK-NEXT: je .LBB0_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: .LBB0_6:
+; CHECK-NEXT: lg %r0, 160(%r15) # 8-byte Reload
+; CHECK-NEXT: sllg %r12, %r0, 48
+; CHECK-NEXT: lg %r0, 184(%r15) # 8-byte Reload
+; CHECK-NEXT: sllg %r10, %r0, 48
+; CHECK-NEXT: ldgr %f13, %r11
+; CHECK-NEXT: ldgr %f8, %r8
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: std %f0, 160(%r15) # 8-byte Spill
+; CHECK-NEXT: ler %f0, %f15
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f11
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: cebr %f0, %f9
+; CHECK-NEXT: je .LBB0_8
+; CHECK-NEXT: # %bb.7:
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: .LBB0_8:
+; CHECK-NEXT: lg %r0, 200(%r15) # 8-byte Reload
+; CHECK-NEXT: sllg %r11, %r0, 48
+; CHECK-NEXT: lg %r0, 192(%r15) # 8-byte Reload
+; CHECK-NEXT: sllg %r9, %r0, 48
+; CHECK-NEXT: ldgr %f15, %r12
+; CHECK-NEXT: ldgr %f9, %r10
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f11, %f0
+; CHECK-NEXT: ler %f0, %f14
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ler %f0, %f12
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: cebr %f0, %f10
+; CHECK-NEXT: je .LBB0_10
+; CHECK-NEXT: # %bb.9:
+; CHECK-NEXT: ler %f0, %f10
+; CHECK-NEXT: .LBB0_10:
+; CHECK-NEXT: ldgr %f14, %r11
+; CHECK-NEXT: ldgr %f10, %r9
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f12, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f13
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: cebr %f0, %f8
+; CHECK-NEXT: je .LBB0_12
+; CHECK-NEXT: # %bb.11:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: .LBB0_12:
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f15
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: cebr %f0, %f9
+; CHECK-NEXT: je .LBB0_14
+; CHECK-NEXT: # %bb.13:
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: .LBB0_14:
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f10
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ler %f0, %f14
+; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
+; CHECK-NEXT: cebr %f0, %f10
+; CHECK-NEXT: je .LBB0_16
+; CHECK-NEXT: # %bb.15:
+; CHECK-NEXT: ler %f0, %f10
+; CHECK-NEXT: .LBB0_16:
+; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 14(%r13)
+; CHECK-NEXT: lgdr %r0, %f9
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 12(%r13)
+; CHECK-NEXT: lgdr %r0, %f8
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 10(%r13)
+; CHECK-NEXT: lgdr %r0, %f12
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 8(%r13)
+; CHECK-NEXT: lgdr %r0, %f11
+; CHECK-NEXT: ld %f8, 264(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f9, 256(%r15) # 8-byte Reload
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: ld %f10, 248(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f11, 240(%r15) # 8-byte Reload
+; CHECK-NEXT: sth %r0, 6(%r13)
+; CHECK-NEXT: lg %r0, 160(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f12, 232(%r15) # 8-byte Reload
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: ld %f13, 224(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f14, 216(%r15) # 8-byte Reload
+; CHECK-NEXT: sth %r0, 4(%r13)
+; CHECK-NEXT: lg %r0, 168(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f15, 208(%r15) # 8-byte Reload
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r13)
+; CHECK-NEXT: lg %r0, 176(%r15) # 8-byte Reload
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT: lmg %r6, %r15, 320(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r11, %r15, 88(%r15)
+; VECTOR-NEXT: .cfi_offset %r11, -72
+; VECTOR-NEXT: .cfi_offset %r12, -64
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -216
+; VECTOR-NEXT: .cfi_def_cfa_offset 376
+; VECTOR-NEXT: std %f8, 208(%r15) # 8-byte Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: vl %v0, 16(%r2), 3
+; VECTOR-NEXT: mvc 176(16,%r15), 0(%r2) # 16-byte Folded Spill
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vreph %v0, %v0, 7
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 7
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT
+; VECTOR-NEXT: cebr %f0, %f8
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lhi %r11, 0
+; VECTOR-NEXT: lhi %r12, 0
+; VECTOR-NEXT: lochie %r11, -1
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 3
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f8
+; VECTOR-NEXT: lhi %r0, 0
+; VECTOR-NEXT: lochie %r0, -1
+; VECTOR-NEXT: vlvgp %v0, %r0, %r11
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f8
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lhi %r0, 0
+; VECTOR-NEXT: lochie %r0, -1
+; VECTOR-NEXT: vlvgh %v0, %r0, 0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f8
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lhi %r0, 0
+; VECTOR-NEXT: lochie %r0, -1
+; VECTOR-NEXT: vlvgh %v0, %r0, 1
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f8
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lhi %r0, 0
+; VECTOR-NEXT: lochie %r0, -1
+; VECTOR-NEXT: vlvgh %v0, %r0, 2
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 4
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 4
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f8
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lhi %r0, 0
+; VECTOR-NEXT: lochie %r0, -1
+; VECTOR-NEXT: vlvgh %v0, %r0, 4
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 5
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 5
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f8
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lhi %r0, 0
+; VECTOR-NEXT: lochie %r0, -1
+; VECTOR-NEXT: vlvgh %v0, %r0, 5
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
+; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 6
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vreph %v0, %v0, 6
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f8
+; VECTOR-NEXT: vl %v2, 160(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
+; VECTOR-NEXT: lochie %r12, -1
+; VECTOR-NEXT: vlvgh %v2, %r12, 6
+; VECTOR-NEXT: ld %f8, 208(%r15) # 8-byte Reload
+; VECTOR-NEXT: vsel %v0, %v0, %v1, %v2
+; VECTOR-NEXT: vst %v0, 0(%r13), 3
+; VECTOR-NEXT: lmg %r11, %r15, 304(%r15)
+; VECTOR-NEXT: br %r14
+ %A = load %Ty0, ptr %Src
+ %S2 = getelementptr %Ty0, ptr %Src, i32 1
+ %B = load %Ty0, ptr %S2
+ %C = fcmp oeq %Ty0 %A, %B
+ %S = select <8 x i1> %C, %Ty0 %A, %Ty0 %B
+ store %Ty0 %S, ptr %Dst
+ ret void
+}
+
+%Ty1 = type <2 x half>
+define void @fun1(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -192
+; CHECK-NEXT: .cfi_def_cfa_offset 352
+; CHECK-NEXT: std %f8, 184(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f9, 176(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f10, 168(%r15) # 8-byte Spill
+; CHECK-NEXT: std %f11, 160(%r15) # 8-byte Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: .cfi_offset %f10, -184
+; CHECK-NEXT: .cfi_offset %f11, -192
+; CHECK-NEXT: lgh %r0, 2(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f8, %r0
+; CHECK-NEXT: lgh %r0, 0(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f11, %r0
+; CHECK-NEXT: lgh %r0, 6(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f10, %r0
+; CHECK-NEXT: lgh %r0, 4(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: lgr %r13, %r3
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f11
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f9
+; CHECK-NEXT: je .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f10
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f10
+; CHECK-NEXT: je .LBB1_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: ler %f0, %f10
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r13)
+; CHECK-NEXT: lgdr %r0, %f9
+; CHECK-NEXT: ld %f8, 184(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f9, 176(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f10, 168(%r15) # 8-byte Reload
+; CHECK-NEXT: ld %f11, 160(%r15) # 8-byte Reload
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT: lmg %r13, %r15, 296(%r15)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
+; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Spill
+; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: vlreph %v0, 4(%r2)
+; VECTOR-NEXT: vlreph %v8, 2(%r2)
+; VECTOR-NEXT: vlreph %v11, 0(%r2)
+; VECTOR-NEXT: vlreph %v9, 6(%r2)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f10
+; VECTOR-NEXT: je .LBB1_2
+; VECTOR-NEXT: # %bb.1:
+; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: .LBB1_2:
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: cebr %f0, %f9
+; VECTOR-NEXT: je .LBB1_4
+; VECTOR-NEXT: # %bb.3:
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: .LBB1_4:
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vsteh %v10, 0(%r13), 0
+; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Reload
+; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Reload
+; VECTOR-NEXT: vsteh %v0, 2(%r13), 0
+; VECTOR-NEXT: lmg %r13, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+ %A = load %Ty1, ptr %Src
+ %S2 = getelementptr %Ty1, ptr %Src, i32 1
+ %B = load %Ty1, ptr %S2
+ %C = fcmp oeq %Ty1 %A, %B
+ %S = select <2 x i1> %C, %Ty1 %A, %Ty1 %B
+ store %Ty1 %S, ptr %Dst
+ ret void
+}
+
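For readers skimming the checks above: without native fp16 arithmetic, every lane is soft-promoted through the compiler-rt helpers named in the checks. A minimal per-lane sketch of the oeq-select pattern follows; the prototypes are simplified (compiler-rt declares the half type per target), so treat the declarations and the function name as assumptions.

// Per-lane model of the fcmp oeq + select lowering exercised above.
// Assumptions: _Float16 support in the compiler, and simplified
// prototypes for the compiler-rt soft-promotion helpers.
extern "C" float __extendhfsf2(_Float16);
extern "C" _Float16 __truncsfhf2(float);

_Float16 selectOeq(_Float16 A, _Float16 B) {
  float FA = __extendhfsf2(A); // widen both operands to float
  float FB = __extendhfsf2(B);
  // fcmp oeq: take A on ordered-equal, otherwise B.
  return __truncsfhf2(FA == FB ? FA : FB);
}
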
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-vsel.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-vsel.ll
deleted file mode 100644
index b7dbaea2188c4..0000000000000
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-vsel.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=VECTOR
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=SCALAR
-
-define <4 x i1> @fun0(ptr %Src) {
-; VECTOR-LABEL: fun0:
-; VECTOR: # %bb.0:
-; VECTOR-NEXT: stmg %r12, %r15, 96(%r15)
-; VECTOR-NEXT: .cfi_offset %r12, -64
-; VECTOR-NEXT: .cfi_offset %r13, -56
-; VECTOR-NEXT: .cfi_offset %r14, -48
-; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -192
-; VECTOR-NEXT: .cfi_def_cfa_offset 352
-; VECTOR-NEXT: vlrepg %v0, 0(%r2)
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vreph %v0, %v0, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ltebr %f0, %f0
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: lhi %r12, 0
-; VECTOR-NEXT: lhi %r13, 0
-; VECTOR-NEXT: lochie %r12, -1
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ltebr %f0, %f0
-; VECTOR-NEXT: lhi %r0, 0
-; VECTOR-NEXT: lochie %r0, -1
-; VECTOR-NEXT: vlvgp %v0, %r0, %r12
-; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ltebr %f0, %f0
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: lhi %r0, 0
-; VECTOR-NEXT: lochie %r0, -1
-; VECTOR-NEXT: vlvgh %v0, %r0, 1
-; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ltebr %f0, %f0
-; VECTOR-NEXT: vl %v24, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: lochie %r13, -1
-; VECTOR-NEXT: vlvgh %v24, %r13, 5
-; VECTOR-NEXT: lmg %r12, %r15, 288(%r15)
-; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun0:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: stmg %r11, %r15, 88(%r15)
-; SCALAR-NEXT: .cfi_offset %r11, -72
-; SCALAR-NEXT: .cfi_offset %r12, -64
-; SCALAR-NEXT: .cfi_offset %r13, -56
-; SCALAR-NEXT: .cfi_offset %r14, -48
-; SCALAR-NEXT: .cfi_offset %r15, -40
-; SCALAR-NEXT: aghi %r15, -184
-; SCALAR-NEXT: .cfi_def_cfa_offset 344
-; SCALAR-NEXT: std %f8, 176(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f9, 168(%r15) # 8-byte Spill
-; SCALAR-NEXT: std %f10, 160(%r15) # 8-byte Spill
-; SCALAR-NEXT: .cfi_offset %f8, -168
-; SCALAR-NEXT: .cfi_offset %f9, -176
-; SCALAR-NEXT: .cfi_offset %f10, -184
-; SCALAR-NEXT: lgh %r0, 6(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f8, %r0
-; SCALAR-NEXT: lgh %r0, 4(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f9, %r0
-; SCALAR-NEXT: lgh %r0, 2(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f10, %r0
-; SCALAR-NEXT: lgh %r0, 0(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; SCALAR-NEXT: ltebr %f0, %f0
-; SCALAR-NEXT: ler %f0, %f10
-; SCALAR-NEXT: ipm %r13
-; SCALAR-NEXT: afi %r13, -268435456
-; SCALAR-NEXT: srl %r13, 31
-; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; SCALAR-NEXT: ltebr %f0, %f0
-; SCALAR-NEXT: ler %f0, %f9
-; SCALAR-NEXT: ipm %r12
-; SCALAR-NEXT: afi %r12, -268435456
-; SCALAR-NEXT: srl %r12, 31
-; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; SCALAR-NEXT: ltebr %f0, %f0
-; SCALAR-NEXT: ler %f0, %f8
-; SCALAR-NEXT: ipm %r11
-; SCALAR-NEXT: afi %r11, -268435456
-; SCALAR-NEXT: srl %r11, 31
-; SCALAR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; SCALAR-NEXT: ltebr %f0, %f0
-; SCALAR-NEXT: ld %f8, 176(%r15) # 8-byte Reload
-; SCALAR-NEXT: ipm %r5
-; SCALAR-NEXT: ld %f9, 168(%r15) # 8-byte Reload
-; SCALAR-NEXT: ld %f10, 160(%r15) # 8-byte Reload
-; SCALAR-NEXT: afi %r5, -268435456
-; SCALAR-NEXT: srl %r5, 31
-; SCALAR-NEXT: lr %r2, %r13
-; SCALAR-NEXT: lr %r3, %r12
-; SCALAR-NEXT: lr %r4, %r11
-; SCALAR-NEXT: lmg %r11, %r15, 272(%r15)
-; SCALAR-NEXT: br %r14
- %1 = load <4 x half>, ptr %Src
- %2 = fcmp oeq <4 x half> %1, zeroinitializer
- ret <4 x i1> %2
-}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll
index 30bbc7de08dd7..b21c538c89ea9 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll
@@ -1,102 +1,116 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=VECTOR
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=SCALAR
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR
+;
+; Test loading and storing fp16 vectors.
define void @fun0(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgh %r0, 0(%r2)
+; CHECK-NEXT: lgh %r1, 2(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: lgh %r0, 4(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f1, %r1
+; CHECK-NEXT: lgh %r1, 6(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f2, %r0
+; CHECK-NEXT: lgh %r0, 8(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: ldgr %f3, %r1
+; CHECK-NEXT: lgh %r1, 10(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f4, %r0
+; CHECK-NEXT: lgh %r0, 12(%r2)
+; CHECK-NEXT: sllg %r1, %r1, 48
+; CHECK-NEXT: lgh %r2, 14(%r2)
+; CHECK-NEXT: ldgr %f5, %r1
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f6, %r0
+; CHECK-NEXT: sllg %r0, %r2, 48
+; CHECK-NEXT: ldgr %f7, %r0
+; CHECK-NEXT: lgdr %r0, %f7
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 14(%r3)
+; CHECK-NEXT: lgdr %r0, %f6
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 12(%r3)
+; CHECK-NEXT: lgdr %r0, %f5
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 10(%r3)
+; CHECK-NEXT: lgdr %r0, %f4
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 8(%r3)
+; CHECK-NEXT: lgdr %r0, %f3
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r3)
+; CHECK-NEXT: lgdr %r0, %f2
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r3)
+; CHECK-NEXT: lgdr %r0, %f1
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 2(%r3)
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+;
; VECTOR-LABEL: fun0:
; VECTOR: # %bb.0:
; VECTOR-NEXT: vl %v0, 0(%r2), 3
; VECTOR-NEXT: vst %v0, 0(%r3), 3
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun0:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: lgh %r0, 0(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: lgh %r0, 2(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f1, %r0
-; SCALAR-NEXT: lgh %r0, 4(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f2, %r0
-; SCALAR-NEXT: lgh %r0, 6(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f3, %r0
-; SCALAR-NEXT: lgh %r0, 8(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f4, %r0
-; SCALAR-NEXT: lgh %r0, 10(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f5, %r0
-; SCALAR-NEXT: lgh %r0, 12(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f6, %r0
-; SCALAR-NEXT: lgh %r0, 14(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f7, %r0
-; SCALAR-NEXT: lgdr %r0, %f7
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 14(%r3)
-; SCALAR-NEXT: lgdr %r0, %f6
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 12(%r3)
-; SCALAR-NEXT: lgdr %r0, %f5
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 10(%r3)
-; SCALAR-NEXT: lgdr %r0, %f4
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 8(%r3)
-; SCALAR-NEXT: lgdr %r0, %f3
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r3)
-; SCALAR-NEXT: lgdr %r0, %f2
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 4(%r3)
-; SCALAR-NEXT: lgdr %r0, %f1
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 2(%r3)
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 0(%r3)
-; SCALAR-NEXT: br %r14
%L = load <8 x half>, ptr %Src
store <8 x half> %L, ptr %Dst
ret void
}
define void @fun1(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgh %r0, 4(%r2)
+; CHECK-NEXT: lgh %r1, 6(%r2)
+; CHECK-NEXT: l %r2, 0(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: sllg %r0, %r1, 48
+; CHECK-NEXT: ldgr %f1, %r0
+; CHECK-NEXT: st %r2, 0(%r3)
+; CHECK-NEXT: lgdr %r0, %f1
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r3)
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r3)
+; CHECK-NEXT: br %r14
+;
; VECTOR-LABEL: fun1:
; VECTOR: # %bb.0:
-; VECTOR-NEXT: lg %r0, 0(%r2)
-; VECTOR-NEXT: stg %r0, 0(%r3)
+; VECTOR-NEXT: l %r0, 0(%r2)
+; VECTOR-NEXT: vlreph %v0, 4(%r2)
+; VECTOR-NEXT: vlreph %v1, 6(%r2)
+; VECTOR-NEXT: vsteh %v1, 6(%r3), 0
+; VECTOR-NEXT: vsteh %v0, 4(%r3), 0
+; VECTOR-NEXT: st %r0, 0(%r3)
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun1:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: lgh %r0, 4(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f0, %r0
-; SCALAR-NEXT: lgh %r0, 6(%r2)
-; SCALAR-NEXT: sllg %r0, %r0, 48
-; SCALAR-NEXT: ldgr %f1, %r0
-; SCALAR-NEXT: l %r0, 0(%r2)
-; SCALAR-NEXT: st %r0, 0(%r3)
-; SCALAR-NEXT: lgdr %r0, %f1
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 6(%r3)
-; SCALAR-NEXT: lgdr %r0, %f0
-; SCALAR-NEXT: srlg %r0, %r0, 48
-; SCALAR-NEXT: sth %r0, 4(%r3)
-; SCALAR-NEXT: br %r14
%L = load <4 x half>, ptr %Src
store <4 x half> %L, ptr %Dst
ret void
}
define void @fun2(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lg %r0, 0(%r2)
+; CHECK-NEXT: lg %r1, 8(%r2)
+; CHECK-NEXT: lg %r2, 16(%r2)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: stg %r1, 8(%r3)
+; CHECK-NEXT: stg %r2, 16(%r3)
+; CHECK-NEXT: br %r14
+;
; VECTOR-LABEL: fun2:
; VECTOR: # %bb.0:
; VECTOR-NEXT: vl %v0, 0(%r2), 4
@@ -104,22 +118,24 @@ define void @fun2(ptr %Src, ptr %Dst) {
; VECTOR-NEXT: lg %r0, 16(%r2)
; VECTOR-NEXT: stg %r0, 16(%r3)
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun2:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: lg %r0, 16(%r2)
-; SCALAR-NEXT: lg %r1, 8(%r2)
-; SCALAR-NEXT: lg %r2, 0(%r2)
-; SCALAR-NEXT: stg %r2, 0(%r3)
-; SCALAR-NEXT: stg %r1, 8(%r3)
-; SCALAR-NEXT: stg %r0, 16(%r3)
-; SCALAR-NEXT: br %r14
%L = load <12 x half>, ptr %Src
store <12 x half> %L, ptr %Dst
ret void
}
define void @fun3(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lg %r0, 24(%r2)
+; CHECK-NEXT: lg %r1, 16(%r2)
+; CHECK-NEXT: lg %r4, 8(%r2)
+; CHECK-NEXT: lg %r2, 0(%r2)
+; CHECK-NEXT: stg %r0, 24(%r3)
+; CHECK-NEXT: stg %r1, 16(%r3)
+; CHECK-NEXT: stg %r4, 8(%r3)
+; CHECK-NEXT: stg %r2, 0(%r3)
+; CHECK-NEXT: br %r14
+;
; VECTOR-LABEL: fun3:
; VECTOR: # %bb.0:
; VECTOR-NEXT: vl %v0, 16(%r2), 4
@@ -127,18 +143,6 @@ define void @fun3(ptr %Src, ptr %Dst) {
; VECTOR-NEXT: vst %v1, 0(%r3), 4
; VECTOR-NEXT: vst %v0, 16(%r3), 4
; VECTOR-NEXT: br %r14
-;
-; SCALAR-LABEL: fun3:
-; SCALAR: # %bb.0:
-; SCALAR-NEXT: lg %r0, 0(%r2)
-; SCALAR-NEXT: lg %r1, 8(%r2)
-; SCALAR-NEXT: lg %r4, 16(%r2)
-; SCALAR-NEXT: lg %r2, 24(%r2)
-; SCALAR-NEXT: stg %r2, 24(%r3)
-; SCALAR-NEXT: stg %r4, 16(%r3)
-; SCALAR-NEXT: stg %r1, 8(%r3)
-; SCALAR-NEXT: stg %r0, 0(%r3)
-; SCALAR-NEXT: br %r14
%L = load <16 x half>, ptr %Src
store <16 x half> %L, ptr %Dst
ret void
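
As a reference for the scalar sequences above: without vector support, each half element travels through a GPR and the high 16 bits of a 64-bit FPR image. A standalone sketch of the bit movement (function names are illustrative, not from the patch):

#include <cstdint>

// Models lgh + sllg 48 (+ ldgr): the 16-bit half pattern is placed in
// the top bits of a 64-bit FPR image.
uint64_t halfToFprBits(const uint16_t *Src) {
  return uint64_t(*Src) << 48;
}

// Models lgdr + srlg 48 (+ sth): recover the 16-bit pattern for the
// halfword store.
uint16_t fprBitsToHalf(uint64_t FprBits) {
  return uint16_t(FprBits >> 48);
}
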
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
deleted file mode 100644
index 824d917444e07..0000000000000
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
+++ /dev/null
@@ -1,709 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=NOVEC
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=VECTOR
-
-; Add the <8 x half> argument with itself and return it.
-define <8 x half> @fun0(<8 x half> %Op) {
-; NOVEC-LABEL: fun0:
-; NOVEC: # %bb.0: # %entry
-; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
-; NOVEC-NEXT: .cfi_offset %r13, -56
-; NOVEC-NEXT: .cfi_offset %r14, -48
-; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -224
-; NOVEC-NEXT: .cfi_def_cfa_offset 384
-; NOVEC-NEXT: std %f8, 216(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f9, 208(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f10, 200(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f11, 192(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f12, 184(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f13, 176(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f14, 168(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f15, 160(%r15) # 8-byte Spill
-; NOVEC-NEXT: .cfi_offset %f8, -168
-; NOVEC-NEXT: .cfi_offset %f9, -176
-; NOVEC-NEXT: .cfi_offset %f10, -184
-; NOVEC-NEXT: .cfi_offset %f11, -192
-; NOVEC-NEXT: .cfi_offset %f12, -200
-; NOVEC-NEXT: .cfi_offset %f13, -208
-; NOVEC-NEXT: .cfi_offset %f14, -216
-; NOVEC-NEXT: .cfi_offset %f15, -224
-; NOVEC-NEXT: lgh %r0, 414(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f14, %r0
-; NOVEC-NEXT: lgh %r0, 406(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f12, %r0
-; NOVEC-NEXT: lgh %r0, 398(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f9, %r0
-; NOVEC-NEXT: lgh %r0, 390(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ler %f10, %f6
-; NOVEC-NEXT: ler %f11, %f4
-; NOVEC-NEXT: ler %f13, %f2
-; NOVEC-NEXT: ler %f15, %f0
-; NOVEC-NEXT: lgr %r13, %r2
-; NOVEC-NEXT: ldgr %f0, %r0
-; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f8, %f0
-; NOVEC-NEXT: ler %f0, %f9
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f9, %f0
-; NOVEC-NEXT: ler %f0, %f12
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f12, %f0
-; NOVEC-NEXT: ler %f0, %f14
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f14, %f0
-; NOVEC-NEXT: ler %f0, %f15
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f15, %f0
-; NOVEC-NEXT: ler %f0, %f13
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f13, %f0
-; NOVEC-NEXT: ler %f0, %f11
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f11, %f0
-; NOVEC-NEXT: ler %f0, %f10
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
-; NOVEC-NEXT: lgdr %r0, %f0
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 6(%r13)
-; NOVEC-NEXT: lgdr %r0, %f11
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 4(%r13)
-; NOVEC-NEXT: lgdr %r0, %f13
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 2(%r13)
-; NOVEC-NEXT: lgdr %r0, %f15
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 0(%r13)
-; NOVEC-NEXT: lgdr %r0, %f14
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 14(%r13)
-; NOVEC-NEXT: lgdr %r0, %f12
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 12(%r13)
-; NOVEC-NEXT: lgdr %r0, %f9
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 10(%r13)
-; NOVEC-NEXT: lgdr %r0, %f8
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 8(%r13)
-; NOVEC-NEXT: ld %f8, 216(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f9, 208(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f10, 200(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f11, 192(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f12, 184(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f13, 176(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f14, 168(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f15, 160(%r15) # 8-byte Reload
-; NOVEC-NEXT: lmg %r13, %r15, 328(%r15)
-; NOVEC-NEXT: br %r14
-;
-; VECTOR-LABEL: fun0:
-; VECTOR: # %bb.0: # %entry
-; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
-; VECTOR-NEXT: .cfi_offset %r14, -48
-; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -224
-; VECTOR-NEXT: .cfi_def_cfa_offset 384
-; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vreph %v0, %v24, 7
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 6
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 5
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 4
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhf %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v1, %v0
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhf %v0, %v0, %v1
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhg %v24, %v0, %v1
-; VECTOR-NEXT: lmg %r14, %r15, 336(%r15)
-; VECTOR-NEXT: br %r14
-entry:
- %Res = fadd <8 x half> %Op, %Op
- ret <8 x half> %Res
-}
-
-; Same, but with partial vector values.
-define <4 x half> @fun1(<4 x half> %Op) {
-; NOVEC-LABEL: fun1:
-; NOVEC: # %bb.0: # %entry
-; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
-; NOVEC-NEXT: .cfi_offset %r14, -48
-; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -192
-; NOVEC-NEXT: .cfi_def_cfa_offset 352
-; NOVEC-NEXT: std %f8, 184(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f9, 176(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f10, 168(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Spill
-; NOVEC-NEXT: .cfi_offset %f8, -168
-; NOVEC-NEXT: .cfi_offset %f9, -176
-; NOVEC-NEXT: .cfi_offset %f10, -184
-; NOVEC-NEXT: .cfi_offset %f11, -192
-; NOVEC-NEXT: ler %f8, %f6
-; NOVEC-NEXT: ler %f9, %f4
-; NOVEC-NEXT: ler %f10, %f2
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f11, %f0
-; NOVEC-NEXT: ler %f0, %f10
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f10, %f0
-; NOVEC-NEXT: ler %f0, %f9
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f9, %f0
-; NOVEC-NEXT: ler %f0, %f8
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: aebr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f6, %f0
-; NOVEC-NEXT: ler %f0, %f11
-; NOVEC-NEXT: ler %f2, %f10
-; NOVEC-NEXT: ler %f4, %f9
-; NOVEC-NEXT: ld %f8, 184(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f9, 176(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f10, 168(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Reload
-; NOVEC-NEXT: lmg %r14, %r15, 304(%r15)
-; NOVEC-NEXT: br %r14
-;
-; VECTOR-LABEL: fun1:
-; VECTOR: # %bb.0: # %entry
-; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
-; VECTOR-NEXT: .cfi_offset %r14, -48
-; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -224
-; VECTOR-NEXT: .cfi_def_cfa_offset 384
-; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vreph %v0, %v24, 7
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 6
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 5
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 4
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhf %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 3
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v0, %v1
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: aebr %f0, %f0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v1, %v0
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhf %v0, %v0, %v1
-; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vmrhg %v24, %v0, %v1
-; VECTOR-NEXT: lmg %r14, %r15, 336(%r15)
-; VECTOR-NEXT: br %r14
-entry:
- %Res = fadd <4 x half> %Op, %Op
- ret <4 x half> %Res
-}
-
-; Test a vector extension.
-define <2 x half> @fun2(<2 x half> %Op) {
-; NOVEC-LABEL: fun2:
-; NOVEC: # %bb.0: # %entry
-; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
-; NOVEC-NEXT: .cfi_offset %r14, -48
-; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -176
-; NOVEC-NEXT: .cfi_def_cfa_offset 336
-; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Spill
-; NOVEC-NEXT: .cfi_offset %f8, -168
-; NOVEC-NEXT: .cfi_offset %f9, -176
-; NOVEC-NEXT: ler %f8, %f2
-; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
-; NOVEC-NEXT: ldr %f9, %f0
-; NOVEC-NEXT: ler %f0, %f8
-; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
-; NOVEC-NEXT: adbr %f9, %f9
-; NOVEC-NEXT: ldr %f8, %f0
-; NOVEC-NEXT: adbr %f8, %f0
-; NOVEC-NEXT: ldr %f0, %f9
-; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
-; NOVEC-NEXT: ler %f9, %f0
-; NOVEC-NEXT: ldr %f0, %f8
-; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
-; NOVEC-NEXT: ler %f2, %f0
-; NOVEC-NEXT: ler %f0, %f9
-; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Reload
-; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
-; NOVEC-NEXT: br %r14
-;
-; VECTOR-LABEL: fun2:
-; VECTOR: # %bb.0: # %entry
-; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
-; VECTOR-NEXT: .cfi_offset %r14, -48
-; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -192
-; VECTOR-NEXT: .cfi_def_cfa_offset 352
-; VECTOR-NEXT: vlr %v0, %v24
-; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
-; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vreph %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0
-; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
-; VECTOR-NEXT: vmrhg %v0, %v1, %v0
-; VECTOR-NEXT: vfadb %v0, %v0, %v0
-; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
-; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill
-; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: vrepg %v0, %v0, 1
-; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
-; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
-; VECTOR-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload
-; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
-; VECTOR-NEXT: vmrhh %v0, %v1, %v0
-; VECTOR-NEXT: vmrhf %v0, %v0, %v0
-; VECTOR-NEXT: vmrhf %v1, %v0, %v0
-; VECTOR-NEXT: vmrhg %v24, %v0, %v1
-; VECTOR-NEXT: lmg %r14, %r15, 304(%r15)
-; VECTOR-NEXT: br %r14
-entry:
- %E = fpext <2 x half> %Op to <2 x double>
- %Add = fadd <2 x double> %E, %E
- %Res = fptrunc <2 x double> %Add to <2 x half>
- ret <2 x half> %Res
-}
-
-; Load and store an <8 x half> vector.
-define void @fun3(ptr %Src, ptr %Dst) {
-; NOVEC-LABEL: fun3:
-; NOVEC: # %bb.0: # %entry
-; NOVEC-NEXT: lgh %r0, 0(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f0, %r0
-; NOVEC-NEXT: lgh %r0, 2(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f1, %r0
-; NOVEC-NEXT: lgh %r0, 4(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f2, %r0
-; NOVEC-NEXT: lgh %r0, 6(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f3, %r0
-; NOVEC-NEXT: lgh %r0, 8(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f4, %r0
-; NOVEC-NEXT: lgh %r0, 10(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f5, %r0
-; NOVEC-NEXT: lgh %r0, 12(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f6, %r0
-; NOVEC-NEXT: lgh %r0, 14(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f7, %r0
-; NOVEC-NEXT: lgdr %r0, %f7
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 14(%r3)
-; NOVEC-NEXT: lgdr %r0, %f6
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 12(%r3)
-; NOVEC-NEXT: lgdr %r0, %f5
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 10(%r3)
-; NOVEC-NEXT: lgdr %r0, %f4
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 8(%r3)
-; NOVEC-NEXT: lgdr %r0, %f3
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 6(%r3)
-; NOVEC-NEXT: lgdr %r0, %f2
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 4(%r3)
-; NOVEC-NEXT: lgdr %r0, %f1
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 2(%r3)
-; NOVEC-NEXT: lgdr %r0, %f0
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 0(%r3)
-; NOVEC-NEXT: br %r14
-;
-; VECTOR-LABEL: fun3:
-; VECTOR: # %bb.0: # %entry
-; VECTOR-NEXT: vl %v0, 0(%r2), 3
-; VECTOR-NEXT: vst %v0, 0(%r3), 3
-; VECTOR-NEXT: br %r14
-entry:
- %L = load <8 x half>, ptr %Src
- store <8 x half> %L, ptr %Dst
- ret void
-}
-
-; Call a function with <8 x half> argument and return values.
-declare <8 x half> @foo(<8 x half>)
-define void @fun4(ptr %Src, ptr %Dst) {
-; NOVEC-LABEL: fun4:
-; NOVEC: # %bb.0: # %entry
-; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
-; NOVEC-NEXT: .cfi_offset %r13, -56
-; NOVEC-NEXT: .cfi_offset %r14, -48
-; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -208
-; NOVEC-NEXT: .cfi_def_cfa_offset 368
-; NOVEC-NEXT: lgh %r0, 0(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f0, %r0
-; NOVEC-NEXT: lgh %r0, 2(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f2, %r0
-; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d
-; NOVEC-NEXT: lgh %r0, 4(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f4, %r0
-; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
-; NOVEC-NEXT: lgh %r0, 6(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f6, %r0
-; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d
-; NOVEC-NEXT: lgh %r0, 8(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f1, %r0
-; NOVEC-NEXT: lgh %r0, 10(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f3, %r0
-; NOVEC-NEXT: lgh %r0, 12(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f5, %r0
-; NOVEC-NEXT: lgh %r0, 14(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f7, %r0
-; NOVEC-NEXT: lgdr %r0, %f7
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 190(%r15)
-; NOVEC-NEXT: lgdr %r0, %f5
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 182(%r15)
-; NOVEC-NEXT: lgdr %r0, %f3
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 174(%r15)
-; NOVEC-NEXT: lgdr %r0, %f1
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: la %r2, 192(%r15)
-; NOVEC-NEXT: lgr %r13, %r3
-; NOVEC-NEXT: sth %r0, 166(%r15)
-; NOVEC-NEXT: brasl %r14, foo at PLT
-; NOVEC-NEXT: lgh %r0, 192(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f0, %r0
-; NOVEC-NEXT: lgh %r0, 194(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f1, %r0
-; NOVEC-NEXT: lgh %r0, 196(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f2, %r0
-; NOVEC-NEXT: lgh %r0, 198(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f3, %r0
-; NOVEC-NEXT: lgh %r0, 200(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f4, %r0
-; NOVEC-NEXT: lgh %r0, 202(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f5, %r0
-; NOVEC-NEXT: lgh %r0, 204(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f6, %r0
-; NOVEC-NEXT: lgh %r0, 206(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f7, %r0
-; NOVEC-NEXT: lgdr %r0, %f7
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 14(%r13)
-; NOVEC-NEXT: lgdr %r0, %f6
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 12(%r13)
-; NOVEC-NEXT: lgdr %r0, %f5
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 10(%r13)
-; NOVEC-NEXT: lgdr %r0, %f4
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 8(%r13)
-; NOVEC-NEXT: lgdr %r0, %f3
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 6(%r13)
-; NOVEC-NEXT: lgdr %r0, %f2
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 4(%r13)
-; NOVEC-NEXT: lgdr %r0, %f1
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 2(%r13)
-; NOVEC-NEXT: lgdr %r0, %f0
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 0(%r13)
-; NOVEC-NEXT: lmg %r13, %r15, 312(%r15)
-; NOVEC-NEXT: br %r14
-;
-; VECTOR-LABEL: fun4:
-; VECTOR: # %bb.0: # %entry
-; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
-; VECTOR-NEXT: .cfi_offset %r13, -56
-; VECTOR-NEXT: .cfi_offset %r14, -48
-; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -160
-; VECTOR-NEXT: .cfi_def_cfa_offset 320
-; VECTOR-NEXT: vl %v24, 0(%r2), 3
-; VECTOR-NEXT: lgr %r13, %r3
-; VECTOR-NEXT: brasl %r14, foo at PLT
-; VECTOR-NEXT: vst %v24, 0(%r13), 3
-; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
-; VECTOR-NEXT: br %r14
-entry:
- %arg = load <8 x half>, ptr %Src
- %Res = call <8 x half> @foo(<8 x half> %arg)
- store <8 x half> %Res, ptr %Dst
- ret void
-}
-
-; Receive and pass argument fully on stack.
-declare void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
-define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) {
-; NOVEC-LABEL: fun5:
-; NOVEC: # %bb.0:
-; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
-; NOVEC-NEXT: .cfi_offset %r14, -48
-; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -256
-; NOVEC-NEXT: .cfi_def_cfa_offset 416
-; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Spill
-; NOVEC-NEXT: .cfi_offset %f8, -168
-; NOVEC-NEXT: .cfi_offset %f9, -176
-; NOVEC-NEXT: .cfi_offset %f10, -184
-; NOVEC-NEXT: .cfi_offset %f11, -192
-; NOVEC-NEXT: lgh %r0, 422(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f1, %r0
-; NOVEC-NEXT: lgh %r0, 430(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f3, %r0
-; NOVEC-NEXT: lgh %r0, 438(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f5, %r0
-; NOVEC-NEXT: lgh %r0, 446(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f7, %r0
-; NOVEC-NEXT: lgh %r0, 454(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f8, %r0
-; NOVEC-NEXT: lgh %r0, 462(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f9, %r0
-; NOVEC-NEXT: lgh %r0, 470(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f10, %r0
-; NOVEC-NEXT: lgh %r0, 478(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f11, %r0
-; NOVEC-NEXT: lgdr %r0, %f11
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 222(%r15)
-; NOVEC-NEXT: lgdr %r0, %f10
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 214(%r15)
-; NOVEC-NEXT: lgdr %r0, %f9
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 206(%r15)
-; NOVEC-NEXT: lgdr %r0, %f8
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 198(%r15)
-; NOVEC-NEXT: lgdr %r0, %f7
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 190(%r15)
-; NOVEC-NEXT: lgdr %r0, %f5
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 182(%r15)
-; NOVEC-NEXT: lgdr %r0, %f3
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 174(%r15)
-; NOVEC-NEXT: lgdr %r0, %f1
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 166(%r15)
-; NOVEC-NEXT: brasl %r14, foo2 at PLT
-; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Reload
-; NOVEC-NEXT: lmg %r14, %r15, 368(%r15)
-; NOVEC-NEXT: br %r14
-;
-; VECTOR-LABEL: fun5:
-; VECTOR: # %bb.0:
-; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
-; VECTOR-NEXT: .cfi_offset %r14, -48
-; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -160
-; VECTOR-NEXT: .cfi_def_cfa_offset 320
-; VECTOR-NEXT: brasl %r14, foo2 at PLT
-; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
-; VECTOR-NEXT: br %r14
- call void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
- ret void
-}
>From 58ce7d8e9b1aa9edd3a75d82f9b31f30e7aaf293 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson at linux.vnet.ibm.com>
Date: Mon, 8 Dec 2025 16:10:49 +0100
Subject: [PATCH 3/4] Updates per buildbot recommendations (format, undef).
---
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 12 ++++++------
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 3 ++-
llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll | 2 +-
3 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index aaa6c22eaf01a..ccd99eeb80ef1 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -845,8 +845,9 @@ bool SystemZTargetLowering::useSoftFloat() const {
return Subtarget.hasSoftFloat();
}
-unsigned SystemZTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
- std::optional<MVT> RegisterVT) const {
+unsigned
+SystemZTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT) const {
// i128 inline assembly operand.
if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped)
return 1;
@@ -6391,13 +6392,12 @@ static SDValue mergeHighParts(SelectionDAG &DAG, const SDLoc &DL,
"Handling full vectors only.");
Op0 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0);
Op1 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op1);
- SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
- DL, IntVecVT, Op0, Op1);
+ SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH, DL, IntVecVT, Op0, Op1);
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
-static SDValue buildFPVecFromScalars4(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
- SmallVectorImpl<SDValue> &Elems,
+static SDValue buildFPVecFromScalars4(SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT, SmallVectorImpl<SDValue> &Elems,
unsigned Pos) {
SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 0], Elems[Pos + 1]);
SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 2], Elems[Pos + 3]);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index ca47b96ef2d80..9ea7f3e556971 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -68,7 +68,8 @@ class SystemZTargetLowering : public TargetLowering {
// Expand (narrow) f16 vectors during type legalization to avoid
// operations for all elements as with expansion after widening.
if (VT.getScalarType() == MVT::f16)
- return VT.getVectorElementCount().isScalar() ? TypeScalarizeVector : TypeSplitVector;
+ return VT.getVectorElementCount().isScalar() ? TypeScalarizeVector
+ : TypeSplitVector;
if (VT.getScalarSizeInBits() % 8 == 0)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
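
To make the hunk above concrete without pulling in LLVM headers, the choice reduces to the following (a sketch with stand-in types; the enum values only mirror the in-tree LegalizeTypeAction names):

// Stand-in for LegalizeTypeAction; only the cases relevant here.
enum class Action { ScalarizeVector, SplitVector, WidenVector };

// NumElts stands in for VT.getVectorElementCount(): <1 x half> is
// scalarized to f16, and wider f16 vectors are split in half
// repeatedly, so only live lanes pay for the
// __extendhfsf2/__truncsfhf2 round trips.
Action preferredF16VectorAction(unsigned NumElts) {
  return NumElts == 1 ? Action::ScalarizeVector : Action::SplitVector;
}
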
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll
index 2f1872fe1ac84..d19f393bfa11a 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll
@@ -142,7 +142,7 @@ define <2 x double> @f3(<4 x half> %vec) {
; VECTOR-NEXT: vmrhg %v24, %v1, %v0
; VECTOR-NEXT: lmg %r14, %r15, 304(%r15)
; VECTOR-NEXT: br %r14
- %shuffle = shufflevector <4 x half> %vec, <4 x half> undef, <2 x i32> <i32 0, i32 2>
+ %shuffle = shufflevector <4 x half> %vec, <4 x half> %vec, <2 x i32> <i32 0, i32 2>
%res = fpext <2 x half> %shuffle to <2 x double>
ret <2 x double> %res
}
>From afbf47f9448b6696e05a921eed5d3f5f079ef237 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson at linux.vnet.ibm.com>
Date: Mon, 8 Dec 2025 17:45:00 +0100
Subject: [PATCH 4/4] Handling of half buildvectors further improved with
tests.
---
.../Target/SystemZ/SystemZISelLowering.cpp | 10 +-
.../CodeGen/SystemZ/fp-half-vector-args.ll | 6 +-
.../CodeGen/SystemZ/fp-half-vector-move.ll | 99 +++++++++++++++++++
3 files changed, 110 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index ccd99eeb80ef1..6dc3042d56d8a 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6402,8 +6402,11 @@ static SDValue buildFPVecFromScalars4(SelectionDAG &DAG, const SDLoc &DL,
SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 0], Elems[Pos + 1]);
SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 2], Elems[Pos + 3]);
// Avoid unnecessary undefs by reusing the other operand.
- if (Op01.isUndef())
+ if (Op01.isUndef()) {
+ if (Op23.isUndef())
+ return Op01;
Op01 = Op23;
+ }
else if (Op23.isUndef())
Op23 = Op01;
// Merging identical replications is a no-op.
@@ -6478,6 +6481,11 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
if (VT == MVT::v8f16 && !AllLoads) {
SDValue Op0123 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 0);
SDValue Op4567 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 4);
+ // Avoid unnecessary undefs by reusing the other operand.
+ if (Op0123.isUndef())
+ Op0123 = Op4567;
+ else if (Op4567.isUndef())
+ Op4567 = Op0123;
// Merging identical replications is a no-op.
if (Op0123.getOpcode() == SystemZISD::REPLICATE && Op0123 == Op4567)
return Op0123;
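
The intent of the two reuse hunks can be modeled without SelectionDAG types: merging tracks undef-ness up a small tree, and a fully undef subtree short-circuits. A hedged sketch, where Val stands in for SDValue and merge() for the MERGE_HIGH node:

// Minimal model of the undef-reuse logic in the hunks above.
struct Val { bool Undef; };

// A merge of two undefs is still undef; anything else is defined.
static Val merge(Val A, Val B) { return Val{A.Undef && B.Undef}; }

static Val buildQuad(Val E0, Val E1, Val E2, Val E3) {
  Val Op01 = merge(E0, E1);
  Val Op23 = merge(E2, E3);
  if (Op01.Undef) {
    if (Op23.Undef)
      return Op01;   // all four lanes undef: no merge needed
    Op01 = Op23;     // reuse the defined pair instead of merging undef
  } else if (Op23.Undef) {
    Op23 = Op01;
  }
  return merge(Op01, Op23);
}
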
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
index 381bfad51188f..8d28e8317bb62 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll
@@ -270,8 +270,7 @@ define void @fun1_call() {
; VECTOR-NEXT: vmrhh %v2, %v2, %v3
; VECTOR-NEXT: vmrhh %v0, %v0, %v1
; VECTOR-NEXT: vmrhf %v0, %v0, %v2
-; VECTOR-NEXT: vmrhf %v1, %v0, %v0
-; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: vmrhg %v24, %v0, %v0
; VECTOR-NEXT: brasl %r14, Fnptr at PLT
; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
@@ -312,8 +311,7 @@ define %Ty1 @fun1_ret() {
; VECTOR-NEXT: vmrhh %v2, %v2, %v3
; VECTOR-NEXT: vmrhh %v0, %v0, %v1
; VECTOR-NEXT: vmrhf %v0, %v0, %v2
-; VECTOR-NEXT: vmrhf %v1, %v0, %v0
-; VECTOR-NEXT: vmrhg %v24, %v0, %v1
+; VECTOR-NEXT: vmrhg %v24, %v0, %v0
; VECTOR-NEXT: br %r14
%L = load %Ty1, ptr @Src
ret %Ty1 %L
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll
new file mode 100644
index 0000000000000..48d2f4b60c62f
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR
+;
+; Test insertions into fp16 undef vectors.
+
+define <8 x half> @f0(half %val) {
+; CHECK-LABEL: f0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r2)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f0:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vreph %v24, %v0, 0
+; VECTOR-NEXT: br %r14
+ %ret = insertelement <8 x half> undef, half %val, i32 2
+ ret <8 x half> %ret
+}
+
+define <8 x half> @f1(half %val) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r2)
+; CHECK-NEXT: sth %r0, 4(%r2)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vreph %v24, %v0, 0
+; VECTOR-NEXT: br %r14
+ %v0 = insertelement <8 x half> undef, half %val, i32 2
+ %ret = insertelement <8 x half> %v0, half %val, i32 3
+ ret <8 x half> %ret
+}
+
+define <8 x half> @f2(half %val0, half %val1) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f2
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 6(%r2)
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 4(%r2)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f2:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: # kill: def $f2h killed $f2h def $v2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v2
+; VECTOR-NEXT: vmrhf %v0, %v0, %v0
+; VECTOR-NEXT: vmrhg %v24, %v0, %v0
+; VECTOR-NEXT: br %r14
+ %v0 = insertelement <8 x half> undef, half %val0, i32 2
+ %ret = insertelement <8 x half> %v0, half %val1, i32 3
+ ret <8 x half> %ret
+}
+
+define <8 x half> @f3(half %val0, half %val1) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f2
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 10(%r2)
+; CHECK-NEXT: lgdr %r1, %f0
+; CHECK-NEXT: srlg %r1, %r1, 48
+; CHECK-NEXT: sth %r1, 8(%r2)
+; CHECK-NEXT: sth %r0, 6(%r2)
+; CHECK-NEXT: sth %r1, 4(%r2)
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f3:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: # kill: def $f2h killed $f2h def $v2
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0
+; VECTOR-NEXT: vmrhh %v0, %v0, %v2
+; VECTOR-NEXT: vmrhf %v0, %v0, %v0
+; VECTOR-NEXT: vmrhg %v24, %v0, %v0
+; VECTOR-NEXT: br %r14
+ %v0 = insertelement <8 x half> undef, half %val0, i32 2
+ %v1 = insertelement <8 x half> %v0, half %val1, i32 3
+ %v2 = insertelement <8 x half> %v1, half %val0, i32 4
+ %ret = insertelement <8 x half> %v2, half %val1, i32 5
+ ret <8 x half> %ret
+}