[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)
Jonas Paulsson via cfe-commits
cfe-commits at lists.llvm.org
Wed Dec 4 16:40:01 PST 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/109164
>From b7c8c192cdea6a8bd128a6e178e1d8d5d2fb865a Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 17 Sep 2024 19:34:34 +0200
Subject: [PATCH 1/3] Initial experiments (with integer regs for fp16).
Experiment with soft-promotion in FP regs (not working). Try to make f16
legal instead Atomic loads/stores, spill/reload, tests for __fp16 and half
vectors. strict f16 with tests. Review Make use of vector facility if
present.
---
clang/docs/LanguageExtensions.rst | 1 +
clang/lib/Basic/Targets/SystemZ.h | 15 +
clang/lib/CodeGen/Targets/SystemZ.cpp | 11 +-
clang/test/CodeGen/SystemZ/Float16.c | 85 ++
clang/test/CodeGen/SystemZ/fp16.c | 39 +
clang/test/CodeGen/SystemZ/systemz-abi.c | 44 +
compiler-rt/test/builtins/CMakeLists.txt | 2 +-
llvm/lib/IR/RuntimeLibcalls.cpp | 5 +
.../SystemZ/AsmParser/SystemZAsmParser.cpp | 14 +
.../MCTargetDesc/SystemZMCTargetDesc.cpp | 19 +
.../MCTargetDesc/SystemZMCTargetDesc.h | 2 +
llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 8 +
llvm/lib/Target/SystemZ/SystemZCallingConv.td | 4 +-
.../Target/SystemZ/SystemZISelDAGToDAG.cpp | 7 +-
.../Target/SystemZ/SystemZISelLowering.cpp | 127 ++-
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 6 +
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 14 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 52 ++
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 2 +
.../lib/Target/SystemZ/SystemZRegisterInfo.td | 25 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 12 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 12 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 12 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 12 +-
.../lib/Target/SystemZ/SystemZScheduleZ196.td | 8 +-
.../Target/SystemZ/SystemZScheduleZEC12.td | 8 +-
llvm/test/CodeGen/SystemZ/atomic-load-10.ll | 22 +
llvm/test/CodeGen/SystemZ/atomic-store-10.ll | 24 +
llvm/test/CodeGen/SystemZ/fp-half-libcall.ll | 312 +++++++
llvm/test/CodeGen/SystemZ/fp-half-strict.ll | 209 +++++
llvm/test/CodeGen/SystemZ/fp-half-vector.ll | 797 ++++++++++++++++++
llvm/test/CodeGen/SystemZ/fp-half.ll | 627 ++++++++++++++
llvm/test/CodeGen/SystemZ/fp-round-03.ll | 15 +-
llvm/test/CodeGen/SystemZ/spill-half-01.mir | 47 ++
llvm/test/CodeGen/SystemZ/spill-half-02.mir | 40 +
llvm/test/CodeGen/SystemZ/twoaddr-kill.mir | 8 +-
36 files changed, 2590 insertions(+), 57 deletions(-)
create mode 100644 clang/test/CodeGen/SystemZ/Float16.c
create mode 100644 clang/test/CodeGen/SystemZ/fp16.c
create mode 100644 llvm/test/CodeGen/SystemZ/atomic-load-10.ll
create mode 100644 llvm/test/CodeGen/SystemZ/atomic-store-10.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-libcall.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-strict.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half.ll
create mode 100644 llvm/test/CodeGen/SystemZ/spill-half-01.mir
create mode 100644 llvm/test/CodeGen/SystemZ/spill-half-02.mir
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index c053a5ab3c528c7..0bec9b38053823f 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -872,6 +872,7 @@ to ``float``; see below for more information on this emulation.
* SPIR (natively)
* X86 (if SSE2 is available; natively if AVX512-FP16 is also available)
* RISC-V (natively if Zfh or Zhinx is available)
+ * SystemZ (emulated)
* ``__bf16`` is supported on the following targets (currently never natively):
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index ef9a07033a6e4ff..b4da2c9ce64754a 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -91,11 +91,26 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
"-v128:64-a:8:16-n32:64");
}
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128;
+
+ // True if the backend supports operations on the half LLVM IR type.
+ // By setting this to false, conversions will happen for _Float16 around
+ // a statement by default, with operations done in float. However, if
+ // -ffloat16-excess-precision=none is given, no conversions will be made
+ // and instead the backend will promote each half operation to float
+ // individually.
+ HasLegalHalfType = false;
+ // Support _Float16.
+ HasFloat16 = true;
+
HasStrictFP = true;
}
unsigned getMinGlobalAlign(uint64_t Size, bool HasNonWeakDef) const override;
+ bool useFP16ConversionIntrinsics() const override {
+ return false;
+ }
+
void getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const override;
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index 23c96fa5cf98cb3..021d764dbfd063e 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,6 +185,7 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
switch (BT->getKind()) {
+ case BuiltinType::Float16: // _Float16
case BuiltinType::Float:
case BuiltinType::Double:
return true;
@@ -277,7 +278,8 @@ RValue SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
} else {
if (AI.getCoerceToType())
ArgTy = AI.getCoerceToType();
- InFPRs = (!IsSoftFloatABI && (ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
+ InFPRs = (!IsSoftFloatABI &&
+ (ArgTy->isHalfTy() || ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
IsVector = ArgTy->isVectorTy();
UnpaddedSize = TyInfo.Width;
DirectAlign = TyInfo.Align;
@@ -446,10 +448,11 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
// The structure is passed as an unextended integer, a float, or a double.
if (isFPArgumentType(SingleElementTy)) {
- assert(Size == 32 || Size == 64);
+ assert(Size == 16 || Size == 32 || Size == 64);
return ABIArgInfo::getDirect(
- Size == 32 ? llvm::Type::getFloatTy(getVMContext())
- : llvm::Type::getDoubleTy(getVMContext()));
+ Size == 16 ? llvm::Type::getHalfTy(getVMContext())
+ : Size == 32 ? llvm::Type::getFloatTy(getVMContext())
+ : llvm::Type::getDoubleTy(getVMContext()));
} else {
llvm::IntegerType *PassTy = llvm::IntegerType::get(getVMContext(), Size);
return Size <= 32 ? ABIArgInfo::getNoExtend(PassTy)
diff --git a/clang/test/CodeGen/SystemZ/Float16.c b/clang/test/CodeGen/SystemZ/Float16.c
new file mode 100644
index 000000000000000..4444dbdcc23ca05
--- /dev/null
+++ b/clang/test/CodeGen/SystemZ/Float16.c
@@ -0,0 +1,85 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=standard -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=STANDARD
+
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=none -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=NONE
+
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=fast -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=FAST
+
+_Float16 f(_Float16 a, _Float16 b, _Float16 c, _Float16 d) {
+ return a * b + c * d;
+}
+
+// STANDARD-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// STANDARD-NEXT: entry:
+// STANDARD-NEXT: %a.addr = alloca half, align 2
+// STANDARD-NEXT: %b.addr = alloca half, align 2
+// STANDARD-NEXT: %c.addr = alloca half, align 2
+// STANDARD-NEXT: %d.addr = alloca half, align 2
+// STANDARD-NEXT: store half %a, ptr %a.addr, align 2
+// STANDARD-NEXT: store half %b, ptr %b.addr, align 2
+// STANDARD-NEXT: store half %c, ptr %c.addr, align 2
+// STANDARD-NEXT: store half %d, ptr %d.addr, align 2
+// STANDARD-NEXT: %0 = load half, ptr %a.addr, align 2
+// STANDARD-NEXT: %ext = fpext half %0 to float
+// STANDARD-NEXT: %1 = load half, ptr %b.addr, align 2
+// STANDARD-NEXT: %ext1 = fpext half %1 to float
+// STANDARD-NEXT: %mul = fmul float %ext, %ext1
+// STANDARD-NEXT: %2 = load half, ptr %c.addr, align 2
+// STANDARD-NEXT: %ext2 = fpext half %2 to float
+// STANDARD-NEXT: %3 = load half, ptr %d.addr, align 2
+// STANDARD-NEXT: %ext3 = fpext half %3 to float
+// STANDARD-NEXT: %mul4 = fmul float %ext2, %ext3
+// STANDARD-NEXT: %add = fadd float %mul, %mul4
+// STANDARD-NEXT: %unpromotion = fptrunc float %add to half
+// STANDARD-NEXT: ret half %unpromotion
+// STANDARD-NEXT: }
+
+// NONE-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// NONE-NEXT: entry:
+// NONE-NEXT: %a.addr = alloca half, align 2
+// NONE-NEXT: %b.addr = alloca half, align 2
+// NONE-NEXT: %c.addr = alloca half, align 2
+// NONE-NEXT: %d.addr = alloca half, align 2
+// NONE-NEXT: store half %a, ptr %a.addr, align 2
+// NONE-NEXT: store half %b, ptr %b.addr, align 2
+// NONE-NEXT: store half %c, ptr %c.addr, align 2
+// NONE-NEXT: store half %d, ptr %d.addr, align 2
+// NONE-NEXT: %0 = load half, ptr %a.addr, align 2
+// NONE-NEXT: %1 = load half, ptr %b.addr, align 2
+// NONE-NEXT: %mul = fmul half %0, %1
+// NONE-NEXT: %2 = load half, ptr %c.addr, align 2
+// NONE-NEXT: %3 = load half, ptr %d.addr, align 2
+// NONE-NEXT: %mul1 = fmul half %2, %3
+// NONE-NEXT: %add = fadd half %mul, %mul1
+// NONE-NEXT: ret half %add
+// NONE-NEXT: }
+
+// FAST-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// FAST-NEXT: entry:
+// FAST-NEXT: %a.addr = alloca half, align 2
+// FAST-NEXT: %b.addr = alloca half, align 2
+// FAST-NEXT: %c.addr = alloca half, align 2
+// FAST-NEXT: %d.addr = alloca half, align 2
+// FAST-NEXT: store half %a, ptr %a.addr, align 2
+// FAST-NEXT: store half %b, ptr %b.addr, align 2
+// FAST-NEXT: store half %c, ptr %c.addr, align 2
+// FAST-NEXT: store half %d, ptr %d.addr, align 2
+// FAST-NEXT: %0 = load half, ptr %a.addr, align 2
+// FAST-NEXT: %ext = fpext half %0 to float
+// FAST-NEXT: %1 = load half, ptr %b.addr, align 2
+// FAST-NEXT: %ext1 = fpext half %1 to float
+// FAST-NEXT: %mul = fmul float %ext, %ext1
+// FAST-NEXT: %2 = load half, ptr %c.addr, align 2
+// FAST-NEXT: %ext2 = fpext half %2 to float
+// FAST-NEXT: %3 = load half, ptr %d.addr, align 2
+// FAST-NEXT: %ext3 = fpext half %3 to float
+// FAST-NEXT: %mul4 = fmul float %ext2, %ext3
+// FAST-NEXT: %add = fadd float %mul, %mul4
+// FAST-NEXT: %unpromotion = fptrunc float %add to half
+// FAST-NEXT: ret half %unpromotion
+// FAST-NEXT: }
diff --git a/clang/test/CodeGen/SystemZ/fp16.c b/clang/test/CodeGen/SystemZ/fp16.c
new file mode 100644
index 000000000000000..430958b69a177b5
--- /dev/null
+++ b/clang/test/CodeGen/SystemZ/fp16.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu -emit-llvm -o - %s \
+// RUN: | FileCheck %s
+
+void f(__fp16 *a, __fp16 *b, __fp16 *c, __fp16 *d, __fp16 *e) {
+ *e = (*a) * (*b) + (*c) * (*d);
+}
+
+// CHECK-LABEL: define dso_local void @f(ptr noundef %a, ptr noundef %b, ptr noundef %c, ptr noundef %d, ptr noundef %e) #0 {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %a.addr = alloca ptr, align 8
+// CHECK-NEXT: %b.addr = alloca ptr, align 8
+// CHECK-NEXT: %c.addr = alloca ptr, align 8
+// CHECK-NEXT: %d.addr = alloca ptr, align 8
+// CHECK-NEXT: %e.addr = alloca ptr, align 8
+// CHECK-NEXT: store ptr %a, ptr %a.addr, align 8
+// CHECK-NEXT: store ptr %b, ptr %b.addr, align 8
+// CHECK-NEXT: store ptr %c, ptr %c.addr, align 8
+// CHECK-NEXT: store ptr %d, ptr %d.addr, align 8
+// CHECK-NEXT: store ptr %e, ptr %e.addr, align 8
+// CHECK-NEXT: %0 = load ptr, ptr %a.addr, align 8
+// CHECK-NEXT: %1 = load half, ptr %0, align 2
+// CHECK-NEXT: %conv = fpext half %1 to float
+// CHECK-NEXT: %2 = load ptr, ptr %b.addr, align 8
+// CHECK-NEXT: %3 = load half, ptr %2, align 2
+// CHECK-NEXT: %conv1 = fpext half %3 to float
+// CHECK-NEXT: %mul = fmul float %conv, %conv1
+// CHECK-NEXT: %4 = load ptr, ptr %c.addr, align 8
+// CHECK-NEXT: %5 = load half, ptr %4, align 2
+// CHECK-NEXT: %conv2 = fpext half %5 to float
+// CHECK-NEXT: %6 = load ptr, ptr %d.addr, align 8
+// CHECK-NEXT: %7 = load half, ptr %6, align 2
+// CHECK-NEXT: %conv3 = fpext half %7 to float
+// CHECK-NEXT: %mul4 = fmul float %conv2, %conv3
+// CHECK-NEXT: %add = fadd float %mul, %mul4
+// CHECK-NEXT: %8 = fptrunc float %add to half
+// CHECK-NEXT: %9 = load ptr, ptr %e.addr, align 8
+// CHECK-NEXT: store half %8, ptr %9, align 2
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c
index fd2b5d450cc643e..2287126bdeabece 100644
--- a/clang/test/CodeGen/SystemZ/systemz-abi.c
+++ b/clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -45,6 +45,9 @@ long long pass_longlong(long long arg) { return arg; }
__int128 pass_int128(__int128 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0)
+_Float16 pass__Float16(_Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}})
+
float pass_float(float arg) { return arg; }
// CHECK-LABEL: define{{.*}} float @pass_float(float %{{.*}})
@@ -72,6 +75,9 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; }
_Complex long long pass_complex_longlong(_Complex long long arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg)
+_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg)
+
_Complex float pass_complex_float(_Complex float arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr %{{.*}}arg)
@@ -123,6 +129,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; }
// Float-like aggregate types
+struct agg__Float16 { _Float16 a; };
+struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
+
struct agg_float { float a; };
struct agg_float pass_agg_float(struct agg_float arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, float %{{.*}})
@@ -137,6 +148,11 @@ struct agg_longdouble { long double a; };
struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}})
+struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); };
+struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, i64 %{{.*}})
+
struct agg_float_a8 { float a __attribute__((aligned (8))); };
struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float_a8(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a8) align 8 %{{.*}}, double %{{.*}})
@@ -164,6 +180,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; }
// Union types likewise are *not* float-like aggregate types
+union union__Float16 { _Float16 a; };
+union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
+
union union_float { float a; };
union union_float pass_union_float(union union_float arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_union_float(ptr dead_on_unwind noalias writable sret(%union.union_float) align 4 %{{.*}}, i32 noext %{{.*}})
@@ -441,6 +461,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l,
// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
// CHECK: ret void
+struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); }
+// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}}
+// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
+// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0
+// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]]
+// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4
+// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5
+// CHECK: br i1 [[FITS_IN_REGS]],
+// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8
+// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128
+// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22
+// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3
+// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]]
+// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]]
+// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1
+// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]]
+// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2
+// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6
+// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8
+// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
+// CHECK: ret void
+
struct agg_float va_agg_float(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float); }
// CHECK-LABEL: define{{.*}} void @va_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, ptr %{{.*}}
// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt
index 8fdcec6029a2a1a..63f4c94605c9075 100644
--- a/compiler-rt/test/builtins/CMakeLists.txt
+++ b/compiler-rt/test/builtins/CMakeLists.txt
@@ -56,7 +56,7 @@ foreach(arch ${BUILTIN_TEST_ARCH})
string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}")
endif()
else()
- if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64" AND COMPILER_RT_HAS_${arch}_FLOAT16)
+ if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64|s390x" AND COMPILER_RT_HAS_${arch}_FLOAT16)
list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_HAS_FLOAT16)
string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}")
endif()
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index e38fce764b64033..7004da809d94997 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -255,4 +255,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
}
setLibcallName(RTLIB::MULO_I128, nullptr);
}
+
+ if (TT.isSystemZ()) {
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+ }
}
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index e4aefc42d860f21..7f528918850261d 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -60,9 +60,11 @@ enum RegisterKind {
GRH32Reg,
GR64Reg,
GR128Reg,
+ FP16Reg,
FP32Reg,
FP64Reg,
FP128Reg,
+ VR16Reg,
VR32Reg,
VR64Reg,
VR128Reg,
@@ -356,9 +358,11 @@ class SystemZOperand : public MCParsedAsmOperand {
bool isADDR32() const { return isReg(GR32Reg); }
bool isADDR64() const { return isReg(GR64Reg); }
bool isADDR128() const { return false; }
+ bool isFP16() const { return isReg(FP16Reg); }
bool isFP32() const { return isReg(FP32Reg); }
bool isFP64() const { return isReg(FP64Reg); }
bool isFP128() const { return isReg(FP128Reg); }
+ bool isVR16() const { return isReg(VR16Reg); }
bool isVR32() const { return isReg(VR32Reg); }
bool isVR64() const { return isReg(VR64Reg); }
bool isVF128() const { return false; }
@@ -534,6 +538,9 @@ class SystemZAsmParser : public MCTargetAsmParser {
ParseStatus parseADDR128(OperandVector &Operands) {
llvm_unreachable("Shouldn't be used as an operand");
}
+ ParseStatus parseFP16(OperandVector &Operands) {
+ return parseRegister(Operands, FP16Reg);
+ }
ParseStatus parseFP32(OperandVector &Operands) {
return parseRegister(Operands, FP32Reg);
}
@@ -543,6 +550,9 @@ class SystemZAsmParser : public MCTargetAsmParser {
ParseStatus parseFP128(OperandVector &Operands) {
return parseRegister(Operands, FP128Reg);
}
+ ParseStatus parseVR16(OperandVector &Operands) {
+ return parseRegister(Operands, VR16Reg);
+ }
ParseStatus parseVR32(OperandVector &Operands) {
return parseRegister(Operands, VR32Reg);
}
@@ -829,11 +839,13 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands,
case GR128Reg:
Group = RegGR;
break;
+ case FP16Reg:
case FP32Reg:
case FP64Reg:
case FP128Reg:
Group = RegFP;
break;
+ case VR16Reg:
case VR32Reg:
case VR64Reg:
case VR128Reg:
@@ -882,9 +894,11 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands,
case GRH32Reg: Regs = SystemZMC::GRH32Regs; break;
case GR64Reg: Regs = SystemZMC::GR64Regs; break;
case GR128Reg: Regs = SystemZMC::GR128Regs; break;
+ case FP16Reg: Regs = SystemZMC::FP16Regs; break;
case FP32Reg: Regs = SystemZMC::FP32Regs; break;
case FP64Reg: Regs = SystemZMC::FP64Regs; break;
case FP128Reg: Regs = SystemZMC::FP128Regs; break;
+ case VR16Reg: Regs = SystemZMC::VR16Regs; break;
case VR32Reg: Regs = SystemZMC::VR32Regs; break;
case VR64Reg: Regs = SystemZMC::VR64Regs; break;
case VR128Reg: Regs = SystemZMC::VR128Regs; break;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 333221c46ebb8bf..291b6789c78f697 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -61,6 +61,13 @@ const unsigned SystemZMC::GR128Regs[16] = {
SystemZ::R12Q, 0, SystemZ::R14Q, 0
};
+const unsigned SystemZMC::FP16Regs[16] = {
+ SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H,
+ SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H,
+ SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H,
+ SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H
+};
+
const unsigned SystemZMC::FP32Regs[16] = {
SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
@@ -82,6 +89,17 @@ const unsigned SystemZMC::FP128Regs[16] = {
SystemZ::F12Q, SystemZ::F13Q, 0, 0
};
+const unsigned SystemZMC::VR16Regs[32] = {
+ SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H,
+ SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H,
+ SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H,
+ SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H,
+ SystemZ::F16H, SystemZ::F17H, SystemZ::F18H, SystemZ::F19H,
+ SystemZ::F20H, SystemZ::F21H, SystemZ::F22H, SystemZ::F23H,
+ SystemZ::F24H, SystemZ::F25H, SystemZ::F26H, SystemZ::F27H,
+ SystemZ::F28H, SystemZ::F29H, SystemZ::F30H, SystemZ::F31H
+};
+
const unsigned SystemZMC::VR32Regs[32] = {
SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
@@ -142,6 +160,7 @@ unsigned SystemZMC::getFirstReg(unsigned Reg) {
Map[AR32Regs[I]] = I;
}
for (unsigned I = 0; I < 32; ++I) {
+ Map[VR16Regs[I]] = I;
Map[VR32Regs[I]] = I;
Map[VR64Regs[I]] = I;
Map[VR128Regs[I]] = I;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 39c1836a137005c..1db1b4b9da0022a 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -43,9 +43,11 @@ extern const unsigned GR32Regs[16];
extern const unsigned GRH32Regs[16];
extern const unsigned GR64Regs[16];
extern const unsigned GR128Regs[16];
+extern const unsigned FP16Regs[16];
extern const unsigned FP32Regs[16];
extern const unsigned FP64Regs[16];
extern const unsigned FP128Regs[16];
+extern const unsigned VR16Regs[32];
extern const unsigned VR32Regs[32];
extern const unsigned VR64Regs[32];
extern const unsigned VR128Regs[32];
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 59154431877a880..8d4dc97f516824a 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -546,6 +546,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
lowerAlignmentHint(MI, LoweredMI, SystemZ::VSTMAlign);
break;
+ case SystemZ::VL16:
+ LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPH);
+ break;
+
case SystemZ::VL32:
LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF);
break;
@@ -554,6 +558,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG);
break;
+ case SystemZ::VST16:
+ LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEH);
+ break;
+
case SystemZ::VST32:
LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF);
break;
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 99bb697ce20142a..0ad872bcb63a749 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,6 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
// other floating-point argument registers available for code that
// doesn't care about the ABI. All floating-point argument registers
// are call-clobbered, so we can use all of them here.
+ CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>,
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
@@ -115,6 +116,7 @@ def CC_SystemZ_ELF : CallingConv<[
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
// The first 4 float and double arguments are passed in even registers F0-F6.
+ CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>,
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
@@ -138,7 +140,7 @@ def CC_SystemZ_ELF : CallingConv<[
CCAssignToStack<16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+ CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>
]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 403d238aa5b5282..e1ba46b08d0db97 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1194,9 +1194,10 @@ void SystemZDAGToDAGISel::loadVectorConstant(
SDValue BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op);
ReplaceNode(Node, BitCast.getNode());
SelectCode(BitCast.getNode());
- } else { // float or double
- unsigned SubRegIdx =
- (VT.getSizeInBits() == 32 ? SystemZ::subreg_h32 : SystemZ::subreg_h64);
+ } else { // half, float or double
+ unsigned SubRegIdx = (VT.getSizeInBits() == 16 ? SystemZ::subreg_h16
+ : VT.getSizeInBits() == 32 ? SystemZ::subreg_h32
+ : SystemZ::subreg_h64);
ReplaceNode(
Node, CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, Op).getNode());
}
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 8f505b7e198cfab..fb159236ec5c2b3 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -103,9 +103,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
if (!useSoftFloat()) {
if (Subtarget.hasVector()) {
+ addRegisterClass(MVT::f16, &SystemZ::VR16BitRegClass);
addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
} else {
+ addRegisterClass(MVT::f16, &SystemZ::FP16BitRegClass);
addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
}
@@ -513,11 +515,24 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
}
// Handle floating-point types.
+ // Promote all f16 operations to float, with some exceptions below.
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, MVT::f16, Promote);
+ setOperationAction(ISD::ConstantFP, MVT::f16, Expand);
+ for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ setTruncStoreAction(VT, MVT::f16, Expand);
+ }
+ for (auto Op : {ISD::LOAD, ISD::ATOMIC_LOAD, ISD::STORE, ISD::ATOMIC_STORE})
+ setOperationAction(Op, MVT::f16, Subtarget.hasVector() ? Legal : Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall);
+
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
++I) {
MVT VT = MVT::SimpleValueType(I);
- if (isTypeLegal(VT)) {
+ if (isTypeLegal(VT) && VT != MVT::f16) {
// We can use FI for FRINT.
setOperationAction(ISD::FRINT, VT, Legal);
@@ -549,7 +564,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
if (Subtarget.hasFPExtension()) {
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
@@ -557,6 +571,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_FROUND, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
}
+
+ // Extension from f16 needs libcall.
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
}
}
@@ -766,6 +784,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Default to having -disable-strictnode-mutation on
IsStrictFPEnabled = true;
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+
if (Subtarget.isTargetzOS()) {
struct RTLibCallMapping {
RTLIB::Libcall Code;
@@ -1656,6 +1677,10 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
NumFixedGPRs += 1;
RC = &SystemZ::GR64BitRegClass;
break;
+ case MVT::f16:
+ NumFixedFPRs += 1;
+ RC = &SystemZ::FP16BitRegClass;
+ break;
case MVT::f32:
NumFixedFPRs += 1;
RC = &SystemZ::FP32BitRegClass;
@@ -1700,9 +1725,12 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
// from this parameter. Unpromoted ints and floats are
// passed as right-justified 8-byte values.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+ if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32 ||
+ VA.getLocVT() == MVT::f16) {
+ unsigned SlotOffs = VA.getLocVT() == MVT::f16 ? 6 : 4;
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
- DAG.getIntPtrConstant(4, DL));
+ DAG.getIntPtrConstant(SlotOffs, DL));
+ }
ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
}
@@ -2015,6 +2043,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getLocMemOffset();
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
Offset += 4;
+ else if (VA.getLocVT() == MVT::f16)
+ Offset += 6;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(Offset, DL));
@@ -4562,6 +4592,22 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT RegVT = Op.getSimpleValueType();
+ if (RegVT.getSizeInBits() == 128)
+ return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerLoadF16(Op, DAG);
+}
+
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+ if (Node->getMemoryVT().getSizeInBits() == 128)
+ return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerStoreF16(Op, DAG);
+}
+
SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
@@ -6109,6 +6155,69 @@ static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) {
return Op;
}
+SDValue SystemZTargetLowering::lowerFP_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue In = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0);
+ if (In.getSimpleValueType() != MVT::f16)
+ return Op; // Legal
+ return SDValue(); // Let legalizer emit the libcall.
+}
+
+SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT RegVT = Op.getSimpleValueType();
+ assert(RegVT == MVT::f16 && "Expected to lower an f16 load.");
+
+ SDLoc DL(Op);
+ SDValue NewLd;
+ if (auto *AtomicLd = dyn_cast<AtomicSDNode>(Op.getNode())) {
+ assert(EVT(RegVT) == AtomicLd->getMemoryVT() && "Unhandled f16 load");
+ NewLd = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, MVT::i16, MVT::i32,
+ AtomicLd->getChain(), AtomicLd->getBasePtr(),
+ AtomicLd->getMemOperand());
+ cast<AtomicSDNode>(NewLd)->setExtensionType(ISD::EXTLOAD);
+ } else {
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ assert(EVT(RegVT) == Ld->getMemoryVT() && "Unhandled f16 load");
+ NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getPointerInfo(),
+ MVT::i16, Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ }
+ // Load as integer, shift and then insert into upper 2 bytes of the FP
+ // register.
+ SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i32, NewLd,
+ DAG.getConstant(16, DL, MVT::i32));
+ SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Shft);
+ SDValue F16Val = DAG.getTargetExtractSubreg(SystemZ::subreg_h16,
+ DL, MVT::f16, BCast);
+ return DAG.getMergeValues({F16Val, NewLd.getValue(1)}, DL);
+}
+
+SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue StoredVal = Op->getOperand(1);
+ MVT StoreVT = StoredVal.getSimpleValueType();
+ assert(StoreVT == MVT::f16 && "Expected to lower an f16 store.");
+
+ // Move into a GPR, shift and store the 2 bytes.
+ SDLoc DL(Op);
+ SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f32);
+ SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL,
+ MVT::f32, SDValue(U32, 0), StoredVal);
+ SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, In32);
+ SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i32, BCast,
+ DAG.getConstant(16, DL, MVT::i32));
+
+ if (auto *AtomicSt = dyn_cast<AtomicSDNode>(Op.getNode()))
+ return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MVT::i16, AtomicSt->getChain(),
+ Shft, AtomicSt->getBasePtr(), AtomicSt->getMemOperand());
+
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+ return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(),
+ MVT::i16, St->getMemOperand());
+}
+
SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -6228,8 +6337,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
+ return lowerATOMIC_STORE(Op, DAG);
case ISD::ATOMIC_LOAD:
- return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerATOMIC_LOAD(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
@@ -6286,6 +6396,13 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerAddrSpaceCast(Op, DAG);
case ISD::ROTL:
return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND:
+ return lowerFP_EXTEND(Op, DAG);
+ case ISD::LOAD:
+ return lowerLoadF16(Op, DAG);
+ case ISD::STORE:
+ return lowerStoreF16(Op, DAG);
case ISD::IS_FPCLASS:
return lowerIS_FPCLASS(Op, DAG);
case ISD::GET_ROUNDING:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 3c06c1fdf2b1bca..3f54563039a9aeb 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -698,6 +698,8 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
unsigned Opcode) const;
@@ -719,6 +721,10 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+ SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerLoadF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerStoreF16(SDValue Op, SelectionDAG &DAG) const;
+
SDValue lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index aad04a2b4159cbf..5b4b73d586a7962 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -36,6 +36,8 @@ defm CondStoreF64 : CondStores<FP64, simple_store,
// Load zero.
let isAsCheapAsAMove = 1, isMoveImm = 1 in {
+ let isCodeGenOnly = 1 in
+ def LZER_16 : InherentRRE<"lzer", 0xB374, FP16, fpimm0>;
def LZER : InherentRRE<"lzer", 0xB374, FP32, fpimm0>;
def LZDR : InherentRRE<"lzdr", 0xB375, FP64, fpimm0>;
def LZXR : InherentRRE<"lzxr", 0xB376, FP128, fpimm0>;
@@ -47,8 +49,11 @@ def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>;
def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
// For z13 we prefer LDR over LER to avoid partial register dependencies.
-let isCodeGenOnly = 1 in
- def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+let isCodeGenOnly = 1 in {
+ def LER16 : UnaryRR <"ler", 0x38, null_frag, FP16, FP16>;
+ def LDR16 : UnaryRR<"ldr", 0x28, null_frag, FP16, FP16>;
+ def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+}
// Moves between two floating-point registers that also set the condition
// codes. Note that these instructions will turn SNaNs into QNaNs and should
@@ -331,8 +336,10 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
}
// Generic form, which does not set CC.
def LCDFR : UnaryRRE<"lcdfr", 0xB373, fneg, FP64, FP64>;
-let isCodeGenOnly = 1 in
+let isCodeGenOnly = 1 in {
+ def LCDFR_16 : UnaryRRE<"lcdfr", 0xB373, fneg, FP16, FP16>;
def LCDFR_32 : UnaryRRE<"lcdfr", 0xB373, fneg, FP32, FP32>;
+}
// Absolute value (Load Positive).
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
@@ -592,6 +599,7 @@ let hasSideEffects = 1 in {
// Peepholes
//===----------------------------------------------------------------------===//
+def : Pat<(f16 fpimmneg0), (LCDFR_16 (LZER_16))>;
def : Pat<(f32 fpimmneg0), (LCDFR_32 (LZER))>;
def : Pat<(f64 fpimmneg0), (LCDFR (LZDR))>;
def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index d553c72589f599d..470543824dc5d06 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -968,6 +968,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode;
if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LGR;
+ else if (SystemZ::FP16BitRegClass.contains(DestReg, SrcReg))
+ Opcode = STI.hasVector() ? SystemZ::LDR16 : SystemZ::LER16;
else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
// For z13 we prefer LDR over LER to avoid partial register dependencies.
Opcode = STI.hasVector() ? SystemZ::LDR32 : SystemZ::LER;
@@ -994,8 +996,31 @@ void SystemZInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIdx, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI, Register VReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ // Without vector support, there are no fp16 load/store instructions, so
+ // need to save/restore via GPR.
+ if (RC == &SystemZ::FP16BitRegClass && !STI.hasVector()) {
+ assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
+ "Expected non-SSA form with virtual registers.");
+ Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::COPY))
+ .addReg(FP64Reg, RegState::DefineNoRead, SystemZ::subreg_h16)
+ .addReg(SrcReg, getKillRegState(isKill));
+ BuildMI(MBB, MBBI, DL, get(SystemZ::LGDR), GR64Reg)
+ .addReg(FP64Reg, RegState::Kill);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::SRLG), GR64Reg)
+ .addReg(GR64Reg)
+ .addReg(0)
+ .addImm(48);
+ addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::STH))
+ .addReg(GR64Reg, RegState::Kill, SystemZ::subreg_l32),
+ FrameIdx);
+ return;
+ }
+
// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
@@ -1011,8 +1036,31 @@ void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
Register VReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ // Without vector support, there are no fp16 load/store instructions, so
+ // need to save/restore via GPR.
+ if (RC == &SystemZ::FP16BitRegClass && !STI.hasVector()) {
+ assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
+ "Expected non-SSA form with virtual registers.");
+ Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
+ addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::LH))
+ .addReg(GR64Reg, RegState::DefineNoRead,
+ SystemZ::subreg_l32),
+ FrameIdx);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::SLLG), GR64Reg)
+ .addReg(GR64Reg)
+ .addReg(0)
+ .addImm(48);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), FP64Reg)
+ .addReg(GR64Reg, RegState::Kill);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::COPY), DestReg)
+ .addReg(FP64Reg, RegState::Kill, SystemZ::subreg_h16);
+ return;
+ }
+
// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
@@ -1883,6 +1931,10 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
} else if (RC == &SystemZ::FP128BitRegClass) {
LoadOpcode = SystemZ::LX;
StoreOpcode = SystemZ::STX;
+ } else if (RC == &SystemZ::FP16BitRegClass ||
+ RC == &SystemZ::VR16BitRegClass) {
+ LoadOpcode = SystemZ::VL16;
+ StoreOpcode = SystemZ::VST16;
} else if (RC == &SystemZ::VR32BitRegClass) {
LoadOpcode = SystemZ::VL32;
StoreOpcode = SystemZ::VST32;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index c09f48891c13916..7b6e4deed18ef64 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -140,6 +140,7 @@ let Predicates = [FeatureVector] in {
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
let mayLoad = 1, canFoldAsLoad = 1 in {
+ def VL16 : UnaryAliasVRX<z_load, v16hb, bdxaddr12pair>;
def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
}
@@ -236,6 +237,7 @@ let Predicates = [FeatureVector] in {
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
let mayStore = 1 in {
+ def VST16 : StoreAliasVRX<store, v16hb, bdxaddr12pair>;
def VST32 : StoreAliasVRX<store, v32sb, bdxaddr12pair>;
def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index 8f9bb56f2eb3bbb..1dfe264b501b1c4 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -20,6 +20,7 @@ class SystemZRegWithSubregs<string n, list<Register> subregs>
}
let Namespace = "SystemZ" in {
+def subreg_h16 : SubRegIndex<16, 16>;
def subreg_l32 : SubRegIndex<32, 0>; // Also acts as subreg_hl32.
def subreg_h32 : SubRegIndex<32, 32>; // Also acts as subreg_hh32.
def subreg_l64 : SubRegIndex<64, 0>;
@@ -201,9 +202,16 @@ def F27Dwarf : DwarfMapping<81>;
def F29Dwarf : DwarfMapping<82>;
def F31Dwarf : DwarfMapping<83>;
+// Upper 16 bits of one of the floating-point registers
+class FPR16<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+
// Upper 32 bits of one of the floating-point registers
-class FPR32<bits<16> num, string n> : SystemZReg<n> {
+class FPR32<bits<16> num, string n, FPR16 high>
+ : SystemZRegWithSubregs<n, [high]> {
let HWEncoding = num;
+ let SubRegIndices = [subreg_h16];
}
// One of the floating-point registers.
@@ -223,12 +231,14 @@ class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
// Floating-point registers. Registers 16-31 require the vector facility.
foreach I = 0-15 in {
- def F#I#S : FPR32<I, "f"#I>;
+ def F#I#H : FPR16<I, "f"#I>;
+ def F#I#S : FPR32<I, "f"#I, !cast<FPR16>("F"#I#"H")>;
def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
foreach I = 16-31 in {
- def F#I#S : FPR32<I, "v"#I>;
+ def F#I#H : FPR16<I, "v"#I>;
+ def F#I#S : FPR32<I, "v"#I, !cast<FPR16>("F"#I#"H")>;
def F#I#D : FPR64<I, "v"#I, !cast<FPR32>("F"#I#"S")>,
DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
@@ -240,6 +250,7 @@ foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
// There's no store-multiple instruction for FPRs, so we're not fussy
// about the order in which call-saved registers are allocated.
+defm FP16 : SystemZRegClass<"FP16", [f16], 16, (sequence "F%uH", 0, 15)>;
defm FP32 : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>;
defm FP64 : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>;
defm FP128 : SystemZRegClass<"FP128", [f128], 128,
@@ -262,6 +273,13 @@ foreach I = 0-31 in {
DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
+// Class used to store 16-bit fp values in the first element of a vector
+// register.
+defm VR16 : SystemZRegClass<"VR16", [f16], 16,
+ (add (sequence "F%uH", 0, 7),
+ (sequence "F%uH", 16, 31),
+ (sequence "F%uH", 8, 15))>;
+
// Class used to store 32-bit values in the first element of a vector
// register. f32 scalars are used for the WLEDB and WLDEB instructions.
defm VR32 : SystemZRegClass<"VR32", [f32, v4i8, v2i16], 32,
@@ -298,6 +316,7 @@ class TypedReg<ValueType vtin, RegisterOperand opin> {
RegisterOperand op = opin;
}
+def v16hb : TypedReg<f16, VR16>;
def v32f : TypedReg<i32, VR32>;
def v32sb : TypedReg<f32, VR32>;
def v64g : TypedReg<i64, VR64>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index d0fec02777875ac..6c1d1df83fafa3f 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -773,12 +773,12 @@ def : InstRW<[], (instregex "Insn.*")>;
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
@@ -840,7 +840,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
// Load Complement / Negative / Positive
def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>;
def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
@@ -1191,7 +1191,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
@@ -1205,7 +1205,7 @@ def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone],
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index a6d89ce9443c5aa..c47fcb7cb0a11b9 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -793,12 +793,12 @@ def : InstRW<[], (instregex "Insn.*")>;
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
@@ -860,7 +860,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
// Load Complement / Negative / Positive
def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>;
def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
@@ -1209,7 +1209,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
@@ -1224,7 +1224,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 455354e283ad8ec..28d34d80adb812a 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -811,12 +811,12 @@ def : InstRW<[], (instregex "Insn.*")>;
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
@@ -878,7 +878,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
// Load Complement / Negative / Positive
def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>;
def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
@@ -1231,7 +1231,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
@@ -1246,7 +1246,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index 92abf0ba4022cc2..24713b8fc93b56f 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -812,12 +812,12 @@ def : InstRW<[], (instregex "Insn.*")>;
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
@@ -879,7 +879,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
// Load Complement / Negative / Positive
def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>;
def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
@@ -1237,7 +1237,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
@@ -1252,7 +1252,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index 99d0d674bbbb2fd..e93f329fb286f49 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -705,12 +705,12 @@ def : InstRW<[], (instregex "Insn.*")>;
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>;
def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LXR$")>;
@@ -771,7 +771,7 @@ def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")>
// Load Complement / Negative / Positive
def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>;
def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 5b334da2bac3429..95dfab6c476bf32 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -743,12 +743,12 @@ def : InstRW<[], (instregex "Insn.*")>;
//===----------------------------------------------------------------------===//
// Load zero
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>;
def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>;
@@ -809,7 +809,7 @@ def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")>
// Load Complement / Negative / Positive
def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>;
def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
// Square root
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-10.ll b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
new file mode 100644
index 000000000000000..e30f9791b51e021
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test fp16 atomic loads.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs -mcpu=z16 | FileCheck %s -check-prefix=VECTOR
+
+define half @f1(ptr %src) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lh %r0, 0(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: vlreph %v0, 0(%r2)
+; VECTOR-NEXT: br %r14
+ %val = load atomic half, ptr %src seq_cst, align 2
+ ret half %val
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-10.ll b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
new file mode 100644
index 000000000000000..3f228d58dcd8ce3
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test half atomic stores.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs -mcpu=z16 | FileCheck %s -check-prefix=VECTOR
+
+define void @f1(ptr %src, half %val) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r2)
+; CHECK-NEXT: bcr 15, %r0
+; CHECK-NEXT: br %r14
+;
+; VECTOR-LABEL: f1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: vsteh %v0, 0(%r2), 0
+; VECTOR-NEXT: bcr 14, %r0
+; VECTOR-NEXT: br %r14
+ store atomic half %val, ptr %src seq_cst, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll b/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll
new file mode 100644
index 000000000000000..6e813a4a5094d76
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll
@@ -0,0 +1,312 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test that library calls are emitted for LLVM IR intrinsics
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define half @f1(half %x, i16 %y) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lhr %r13, %r2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: llgfr %r2, %r13
+; CHECK-NEXT: brasl %r14, __powisf2 at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r13, %r15, 264(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.powi.f16.i16(half %x, i16 %y)
+ ret half %tmp
+}
+
+define half @f2(half %x, half %y) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f8, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f2, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, powf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.pow.f16(half %x, half %y)
+ ret half %tmp
+}
+
+define half @f3(half %x) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, sinf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.sin.f16(half %x)
+ ret half %tmp
+}
+
+define half @f4(half %x) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, cosf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.cos.f16(half %x)
+ ret half %tmp
+}
+
+define half @f5(half %x) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, expf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.exp.f16(half %x)
+ ret half %tmp
+}
+
+define half @f6(half %x) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, exp2f at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.exp2.f16(half %x)
+ ret half %tmp
+}
+
+define half @f7(half %x) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, logf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.log.f16(half %x)
+ ret half %tmp
+}
+
+define half @f8(half %x) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, log2f at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.log2.f16(half %x)
+ ret half %tmp
+}
+
+define half @f9(half %x) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, log10f at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.log10.f16(half %x)
+ ret half %tmp
+}
+
+define half @f10(half %x, half %y) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f8, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f2, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, fminf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.minnum.f16(half %x, half %y)
+ ret half %tmp
+}
+
+define half @f11(half %x, half %y) {
+; CHECK-LABEL: f11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f8, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f2, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, fmaxf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.maxnum.f16(half %x, half %y)
+ ret half %tmp
+}
+
+; Verify that "nnan" minnum/maxnum calls are transformed to
+; compare+select sequences instead of libcalls.
+define half @f12(half %x, half %y) {
+; CHECK-LABEL: f12:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f8
+; CHECK-NEXT: jl .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call nnan half @llvm.minnum.f16(half %x, half %y)
+ ret half %tmp
+}
+
+define half @f13(half %x, half %y) {
+; CHECK-LABEL: f13:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f8
+; CHECK-NEXT: jh .LBB12_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call nnan half @llvm.maxnum.f16(half %x, half %y)
+ ret half %tmp
+}
+
+declare half @llvm.powi.f16.i16(half, i16)
+declare half @llvm.pow.f16(half, half)
+
+declare half @llvm.sin.f16(half)
+declare half @llvm.cos.f16(half)
+
+declare half @llvm.exp.f16(half)
+declare half @llvm.exp2.f16(half)
+
+declare half @llvm.log.f16(half)
+declare half @llvm.log2.f16(half)
+declare half @llvm.log10.f16(half)
+
+declare half @llvm.minnum.f16(half, half)
+declare half @llvm.maxnum.f16(half, half)
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-strict.ll b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll
new file mode 100644
index 000000000000000..42663b109d7a9f7
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+;
+; Tests for strict 16-bit floating point (half).
+
+declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata)
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
+declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata)
+
+; Test register addition.
+define half @fun0(half %f1, half %f2) #0 {
+; NOVEC-LABEL: fun0:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+ %res = call half @llvm.experimental.constrained.fadd.f16(
+ half %f1, half %f2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret half %res
+}
+
+; Test atomic memory accesses and extension/truncation inside a strictfp
+; function.
+define void @fun1(ptr %Src, ptr %Dst) #0 {
+; NOVEC-LABEL: fun1:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
+; NOVEC-NEXT: adbr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: bcr 14, %r0
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vlreph %v0, 0(%r2)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
+; VECTOR-NEXT: adbr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
+; VECTOR-NEXT: vsteh %v0, 0(%r13), 0
+; VECTOR-NEXT: bcr 14, %r0
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Op0 = load atomic half, ptr %Src seq_cst, align 2
+ %E0 = fpext half %Op0 to double
+ %Add = call double @llvm.experimental.constrained.fadd.f64(
+ double %E0, double %E0,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %Res = fptrunc double %Add to half
+ store atomic half %Res, ptr %Dst seq_cst, align 2
+ ret void
+}
+
+; Test a chain of half operations which should have each operation surrounded
+; by conversions to/from fp32 for proper emulation.
+define half @fun2(half %Op0, half %Op1, half %Op2) #0 {
+; NOVEC-LABEL: fun2:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: ler %f8, %f4
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: meebr %f0, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: meebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: ldr %f8, %f4
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: meebr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: wfmsb %f0, %f9, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %A0 = call half @llvm.experimental.constrained.fmul.f16(
+ half %Op0, half %Op1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %Res = call half @llvm.experimental.constrained.fmul.f16(
+ half %A0, half %Op2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret half %Res
+}
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
new file mode 100644
index 000000000000000..cc3f61f9986494a
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
@@ -0,0 +1,797 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+
+; Add the <8 x half> argument with itself and return it.
+define <8 x half> @fun0(<8 x half> %Op) {
+; NOVEC-LABEL: fun0:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -224
+; NOVEC-NEXT: .cfi_def_cfa_offset 384
+; NOVEC-NEXT: std %f8, 216(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 208(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 200(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 192(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f12, 184(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f13, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f14, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f15, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: .cfi_offset %f12, -200
+; NOVEC-NEXT: .cfi_offset %f13, -208
+; NOVEC-NEXT: .cfi_offset %f14, -216
+; NOVEC-NEXT: .cfi_offset %f15, -224
+; NOVEC-NEXT: lh %r0, 414(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f15, %r0
+; NOVEC-NEXT: lh %r0, 406(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f12, %r0
+; NOVEC-NEXT: lh %r0, 398(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f9, %r0
+; NOVEC-NEXT: lh %r0, 390(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ler %f10, %f6
+; NOVEC-NEXT: ler %f11, %f4
+; NOVEC-NEXT: ler %f13, %f2
+; NOVEC-NEXT: ler %f14, %f0
+; NOVEC-NEXT: lgr %r13, %r2
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f12
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f12, %f0
+; NOVEC-NEXT: ler %f0, %f15
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f15, %f0
+; NOVEC-NEXT: ler %f0, %f14
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f14, %f0
+; NOVEC-NEXT: ler %f0, %f13
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f13, %f0
+; NOVEC-NEXT: ler %f0, %f11
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f11, %f0
+; NOVEC-NEXT: ler %f0, %f10
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r13)
+; NOVEC-NEXT: lgdr %r0, %f11
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r13)
+; NOVEC-NEXT: lgdr %r0, %f13
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r13)
+; NOVEC-NEXT: lgdr %r0, %f14
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lgdr %r0, %f15
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r13)
+; NOVEC-NEXT: lgdr %r0, %f12
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r13)
+; NOVEC-NEXT: lgdr %r0, %f9
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r13)
+; NOVEC-NEXT: lgdr %r0, %f8
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r13)
+; NOVEC-NEXT: ld %f8, 216(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 208(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 200(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 192(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f12, 184(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f13, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f14, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f15, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r13, %r15, 328(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -224
+; VECTOR-NEXT: .cfi_def_cfa_offset 384
+; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: .cfi_offset %f12, -200
+; VECTOR-NEXT: .cfi_offset %f13, -208
+; VECTOR-NEXT: .cfi_offset %f14, -216
+; VECTOR-NEXT: .cfi_offset %f15, -224
+; VECTOR-NEXT: vlreph %v11, 414(%r15)
+; VECTOR-NEXT: vlreph %v12, 406(%r15)
+; VECTOR-NEXT: vlreph %v13, 398(%r15)
+; VECTOR-NEXT: vlreph %v14, 390(%r15)
+; VECTOR-NEXT: ldr %f8, %f6
+; VECTOR-NEXT: ldr %f9, %f4
+; VECTOR-NEXT: ldr %f10, %f2
+; VECTOR-NEXT: lgr %r13, %r2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f15, %f0
+; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f14
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f14, %f0
+; VECTOR-NEXT: ldr %f0, %f13
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f13, %f0
+; VECTOR-NEXT: ldr %f0, %f12
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f12, %f0
+; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vsteh %v0, 14(%r13), 0
+; VECTOR-NEXT: vsteh %v12, 12(%r13), 0
+; VECTOR-NEXT: vsteh %v13, 10(%r13), 0
+; VECTOR-NEXT: vsteh %v14, 8(%r13), 0
+; VECTOR-NEXT: vsteh %v8, 6(%r13), 0
+; VECTOR-NEXT: vsteh %v9, 4(%r13), 0
+; VECTOR-NEXT: vsteh %v10, 2(%r13), 0
+; VECTOR-NEXT: vsteh %v15, 0(%r13), 0
+; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r13, %r15, 328(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Res = fadd <8 x half> %Op, %Op
+ ret <8 x half> %Res
+}
+
+; Same, but with partial vector values.
+define <4 x half> @fun1(<4 x half> %Op) {
+; NOVEC-LABEL: fun1:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -192
+; NOVEC-NEXT: .cfi_def_cfa_offset 352
+; NOVEC-NEXT: std %f8, 184(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: ler %f8, %f6
+; NOVEC-NEXT: ler %f9, %f4
+; NOVEC-NEXT: ler %f10, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f11, %f0
+; NOVEC-NEXT: ler %f0, %f10
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f6, %f0
+; NOVEC-NEXT: ler %f0, %f11
+; NOVEC-NEXT: ler %f2, %f10
+; NOVEC-NEXT: ler %f4, %f9
+; NOVEC-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 304(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
+; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: ldr %f8, %f6
+; VECTOR-NEXT: ldr %f9, %f4
+; VECTOR-NEXT: ldr %f10, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f11, %f0
+; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f6, %f0
+; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: ldr %f2, %f10
+; VECTOR-NEXT: ldr %f4, %f9
+; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 304(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Res = fadd <4 x half> %Op, %Op
+ ret <4 x half> %Res
+}
+
+; Test a vector extension.
+define <2 x half> @fun2(<2 x half> %Op) {
+; NOVEC-LABEL: fun2:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
+; NOVEC-NEXT: ldr %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
+; NOVEC-NEXT: adbr %f9, %f9
+; NOVEC-NEXT: ldr %f8, %f0
+; NOVEC-NEXT: adbr %f8, %f0
+; NOVEC-NEXT: ldr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ldr %f0, %f8
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
+; NOVEC-NEXT: ler %f2, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
+; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
+; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
+; VECTOR-NEXT: vmrhg %v0, %v0, %v1
+; VECTOR-NEXT: vfadb %v0, %v0, %v0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: vrepg %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
+; VECTOR-NEXT: ldr %f2, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E = fpext <2 x half> %Op to <2 x double>
+ %Add = fadd <2 x double> %E, %E
+ %Res = fptrunc <2 x double> %Add to <2 x half>
+ ret <2 x half> %Res
+}
+
+; Load and store an <8 x half> vector.
+define void @fun3(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun3:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 2(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 4(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: lh %r0, 6(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 8(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: lh %r0, 10(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 12(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: lh %r0, 14(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r3)
+; NOVEC-NEXT: lgdr %r0, %f6
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r3)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r3)
+; NOVEC-NEXT: lgdr %r0, %f4
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r3)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r3)
+; NOVEC-NEXT: lgdr %r0, %f2
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r3)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r3)
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r3)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: vlreph %v0, 0(%r2)
+; VECTOR-NEXT: vlreph %v1, 2(%r2)
+; VECTOR-NEXT: vlreph %v2, 4(%r2)
+; VECTOR-NEXT: vlreph %v3, 6(%r2)
+; VECTOR-NEXT: vlreph %v4, 8(%r2)
+; VECTOR-NEXT: vlreph %v5, 10(%r2)
+; VECTOR-NEXT: vlreph %v6, 12(%r2)
+; VECTOR-NEXT: vlreph %v7, 14(%r2)
+; VECTOR-NEXT: vsteh %v7, 14(%r3), 0
+; VECTOR-NEXT: vsteh %v6, 12(%r3), 0
+; VECTOR-NEXT: vsteh %v5, 10(%r3), 0
+; VECTOR-NEXT: vsteh %v4, 8(%r3), 0
+; VECTOR-NEXT: vsteh %v3, 6(%r3), 0
+; VECTOR-NEXT: vsteh %v2, 4(%r3), 0
+; VECTOR-NEXT: vsteh %v1, 2(%r3), 0
+; VECTOR-NEXT: vsteh %v0, 0(%r3), 0
+; VECTOR-NEXT: br %r14
+entry:
+ %L = load <8 x half>, ptr %Src
+ store <8 x half> %L, ptr %Dst
+ ret void
+}
+
+; Call a function with <8 x half> argument and return values.
+declare <8 x half> @foo(<8 x half>)
+define void @fun4(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun4:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -208
+; NOVEC-NEXT: .cfi_def_cfa_offset 368
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 2(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; NOVEC-NEXT: lh %r0, 4(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; NOVEC-NEXT: lh %r0, 6(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; NOVEC-NEXT: lh %r0, 8(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 10(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 12(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 14(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 190(%r15)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 182(%r15)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: la %r2, 192(%r15)
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, foo at PLT
+; NOVEC-NEXT: lh %r0, 192(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 194(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 196(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: lh %r0, 198(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 200(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: lh %r0, 202(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 204(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: lh %r0, 206(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r13)
+; NOVEC-NEXT: lgdr %r0, %f6
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r13)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r13)
+; NOVEC-NEXT: lgdr %r0, %f4
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r13)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r13)
+; NOVEC-NEXT: lgdr %r0, %f2
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r13)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r13)
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 312(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun4:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -208
+; VECTOR-NEXT: .cfi_def_cfa_offset 368
+; VECTOR-NEXT: vlreph %v6, 6(%r2)
+; VECTOR-NEXT: vlreph %v4, 4(%r2)
+; VECTOR-NEXT: vlreph %v2, 2(%r2)
+; VECTOR-NEXT: vlreph %v0, 0(%r2)
+; VECTOR-NEXT: vlreph %v1, 8(%r2)
+; VECTOR-NEXT: vlreph %v3, 10(%r2)
+; VECTOR-NEXT: vlreph %v5, 12(%r2)
+; VECTOR-NEXT: vlreph %v7, 14(%r2)
+; VECTOR-NEXT: la %r2, 192(%r15)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vsteh %v7, 190(%r15), 0
+; VECTOR-NEXT: vsteh %v5, 182(%r15), 0
+; VECTOR-NEXT: vsteh %v3, 174(%r15), 0
+; VECTOR-NEXT: vsteh %v1, 166(%r15), 0
+; VECTOR-NEXT: brasl %r14, foo at PLT
+; VECTOR-NEXT: vlreph %v0, 192(%r15)
+; VECTOR-NEXT: vlreph %v1, 194(%r15)
+; VECTOR-NEXT: vlreph %v2, 196(%r15)
+; VECTOR-NEXT: vlreph %v3, 198(%r15)
+; VECTOR-NEXT: vlreph %v4, 200(%r15)
+; VECTOR-NEXT: vlreph %v5, 202(%r15)
+; VECTOR-NEXT: vlreph %v6, 204(%r15)
+; VECTOR-NEXT: vlreph %v7, 206(%r15)
+; VECTOR-NEXT: vsteh %v7, 14(%r13), 0
+; VECTOR-NEXT: vsteh %v6, 12(%r13), 0
+; VECTOR-NEXT: vsteh %v5, 10(%r13), 0
+; VECTOR-NEXT: vsteh %v4, 8(%r13), 0
+; VECTOR-NEXT: vsteh %v3, 6(%r13), 0
+; VECTOR-NEXT: vsteh %v2, 4(%r13), 0
+; VECTOR-NEXT: vsteh %v1, 2(%r13), 0
+; VECTOR-NEXT: vsteh %v0, 0(%r13), 0
+; VECTOR-NEXT: lmg %r13, %r15, 312(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %arg = load <8 x half>, ptr %Src
+ %Res = call <8 x half> @foo(<8 x half> %arg)
+ store <8 x half> %Res, ptr %Dst
+ ret void
+}
+
+; Receive and pass argument fully on stack.
+declare void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
+define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) {
+; NOVEC-LABEL: fun5:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -256
+; NOVEC-NEXT: .cfi_def_cfa_offset 416
+; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: lh %r0, 422(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 430(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 438(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 446(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lh %r0, 454(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f8, %r0
+; NOVEC-NEXT: lh %r0, 462(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f9, %r0
+; NOVEC-NEXT: lh %r0, 470(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f10, %r0
+; NOVEC-NEXT: lh %r0, 478(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f11, %r0
+; NOVEC-NEXT: lgdr %r0, %f11
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 222(%r15)
+; NOVEC-NEXT: lgdr %r0, %f10
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 214(%r15)
+; NOVEC-NEXT: lgdr %r0, %f9
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 206(%r15)
+; NOVEC-NEXT: lgdr %r0, %f8
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 198(%r15)
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 190(%r15)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 182(%r15)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, foo2 at PLT
+; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 368(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun5:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -224
+; VECTOR-NEXT: .cfi_def_cfa_offset 384
+; VECTOR-NEXT: vlreph %v1, 390(%r15)
+; VECTOR-NEXT: vlreph %v3, 398(%r15)
+; VECTOR-NEXT: vlreph %v5, 406(%r15)
+; VECTOR-NEXT: vlreph %v7, 414(%r15)
+; VECTOR-NEXT: vlreph %v16, 422(%r15)
+; VECTOR-NEXT: vlreph %v17, 430(%r15)
+; VECTOR-NEXT: vlreph %v18, 438(%r15)
+; VECTOR-NEXT: vlreph %v19, 446(%r15)
+; VECTOR-NEXT: vsteh %v19, 222(%r15), 0
+; VECTOR-NEXT: vsteh %v18, 214(%r15), 0
+; VECTOR-NEXT: vsteh %v17, 206(%r15), 0
+; VECTOR-NEXT: vsteh %v16, 198(%r15), 0
+; VECTOR-NEXT: vsteh %v7, 190(%r15), 0
+; VECTOR-NEXT: vsteh %v5, 182(%r15), 0
+; VECTOR-NEXT: vsteh %v3, 174(%r15), 0
+; VECTOR-NEXT: vsteh %v1, 166(%r15), 0
+; VECTOR-NEXT: brasl %r14, foo2 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 336(%r15)
+; VECTOR-NEXT: br %r14
+ call void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
new file mode 100644
index 000000000000000..cd4aa12c2b4ef0e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -0,0 +1,627 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+;
+; Tests for 16-bit floating point (half).
+
+; Incoming half arguments added together and returned.
+define half @fun0(half %Op0, half %Op1) {
+; NOVEC-LABEL: fun0:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Res = fadd half %Op0, %Op1
+ ret half %Res
+}
+
+define half @fun1(half %Op0, half %Op1) {
+; NOVEC-LABEL: fun1:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
+; NOVEC-NEXT: ldr %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
+; NOVEC-NEXT: adbr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: ldr %f8, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
+; VECTOR-NEXT: wfadb %f0, %f9, %f0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E0 = fpext half %Op0 to double
+ %E1 = fpext half %Op1 to double
+ %Add = fadd double %E0, %E1
+ %Res = fptrunc double %Add to half
+ ret half %Res
+}
+
+define half @fun2(half %Op0, half %Op1) {
+; NOVEC-LABEL: fun2:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -232
+; NOVEC-NEXT: .cfi_def_cfa_offset 392
+; NOVEC-NEXT: std %f8, 224(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 216(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 208(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f11, -184
+; NOVEC-NEXT: la %r2, 160(%r15)
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhftf2 at PLT
+; NOVEC-NEXT: ld %f9, 160(%r15)
+; NOVEC-NEXT: ld %f11, 168(%r15)
+; NOVEC-NEXT: la %r2, 176(%r15)
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhftf2 at PLT
+; NOVEC-NEXT: ld %f0, 176(%r15)
+; NOVEC-NEXT: ld %f2, 184(%r15)
+; NOVEC-NEXT: la %r2, 192(%r15)
+; NOVEC-NEXT: axbr %f0, %f9
+; NOVEC-NEXT: std %f0, 192(%r15)
+; NOVEC-NEXT: std %f2, 200(%r15)
+; NOVEC-NEXT: brasl %r14, __trunctfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 224(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 216(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 208(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 344(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -232
+; VECTOR-NEXT: .cfi_def_cfa_offset 392
+; VECTOR-NEXT: std %f8, 224(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: la %r2, 176(%r15)
+; VECTOR-NEXT: ldr %f8, %f2
+; VECTOR-NEXT: brasl %r14, __extendhftf2 at PLT
+; VECTOR-NEXT: vl %v0, 176(%r15), 3
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: la %r2, 192(%r15)
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhftf2 at PLT
+; VECTOR-NEXT: vl %v0, 192(%r15), 3
+; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: wfaxb %v0, %v1, %v0
+; VECTOR-NEXT: la %r2, 208(%r15)
+; VECTOR-NEXT: vst %v0, 208(%r15), 3
+; VECTOR-NEXT: brasl %r14, __trunctfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 224(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 344(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E0 = fpext half %Op0 to fp128
+ %E1 = fpext half %Op1 to fp128
+ %Add = fadd fp128 %E0, %E1
+ %Res = fptrunc fp128 %Add to half
+ ret half %Res
+}
+
+; Test loading and storing a half value.
+define void @fun3(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun3:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r3)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: vlreph %v0, 0(%r2)
+; VECTOR-NEXT: vsteh %v0, 0(%r3), 0
+; VECTOR-NEXT: br %r14
+entry:
+ %L = load half, ptr %Src, align 2
+ store half %L, ptr %Dst, align 2
+ ret void
+}
+
+define void @fun4(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun4:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
+; NOVEC-NEXT: adbr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun4:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vlreph %v0, 0(%r2)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
+; VECTOR-NEXT: adbr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
+; VECTOR-NEXT: vsteh %v0, 0(%r13), 0
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Op0 = load half, ptr %Src, align 2
+ %E0 = fpext half %Op0 to double
+ %Add = fadd double %E0, %E0
+ %Res = fptrunc double %Add to half
+ store half %Res, ptr %Dst, align 2
+ ret void
+}
+
+define void @fun5(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun5:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -192
+; NOVEC-NEXT: .cfi_def_cfa_offset 352
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: la %r2, 160(%r15)
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhftf2 at PLT
+; NOVEC-NEXT: ld %f0, 160(%r15)
+; NOVEC-NEXT: ld %f2, 168(%r15)
+; NOVEC-NEXT: la %r2, 176(%r15)
+; NOVEC-NEXT: axbr %f0, %f0
+; NOVEC-NEXT: std %f0, 176(%r15)
+; NOVEC-NEXT: std %f2, 184(%r15)
+; NOVEC-NEXT: brasl %r14, __trunctfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun5:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
+; VECTOR-NEXT: vlreph %v0, 0(%r2)
+; VECTOR-NEXT: la %r2, 160(%r15)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: brasl %r14, __extendhftf2 at PLT
+; VECTOR-NEXT: vl %v0, 160(%r15), 3
+; VECTOR-NEXT: wfaxb %v0, %v0, %v0
+; VECTOR-NEXT: la %r2, 176(%r15)
+; VECTOR-NEXT: vst %v0, 176(%r15), 3
+; VECTOR-NEXT: brasl %r14, __trunctfhf2 at PLT
+; VECTOR-NEXT: vsteh %v0, 0(%r13), 0
+; VECTOR-NEXT: lmg %r13, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Op0 = load half, ptr %Src, align 2
+ %E0 = fpext half %Op0 to fp128
+ %Add = fadd fp128 %E0, %E0
+ %Res = fptrunc fp128 %Add to half
+ store half %Res, ptr %Dst, align 2
+ ret void
+}
+
+; Test a chain of half operations which should have each operation surrounded
+; by conversions to/from fp32 for proper emulation.
+define half @fun6(half %Op0, half %Op1, half %Op2) {
+; NOVEC-LABEL: fun6:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: ler %f8, %f4
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun6:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: ldr %f8, %f4
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: wfasb %f0, %f9, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %A0 = fadd half %Op0, %Op1
+ %Res = fadd half %A0, %Op2
+ ret half %Res
+}
+
+; Store an incoming half argument and return a loaded one.
+define half @fun7(half %Op0, ptr %Dst, ptr %Src) {
+; NOVEC-LABEL: fun7:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r2)
+; NOVEC-NEXT: lh %r0, 0(%r3)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun7:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: vsteh %v0, 0(%r2), 0
+; VECTOR-NEXT: vlreph %v0, 0(%r3)
+; VECTOR-NEXT: br %r14
+entry:
+ store half %Op0, ptr %Dst
+ %Res = load half, ptr %Src
+ ret half %Res
+}
+
+; Call a function with half argument and return values.
+declare half @foo(half)
+define void @fun8(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun8:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, foo at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun8:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: vlreph %v0, 0(%r2)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: brasl %r14, foo at PLT
+; VECTOR-NEXT: vsteh %v0, 0(%r13), 0
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %arg = load half, ptr %Src
+ %Res = call half @foo(half %arg)
+ store half %Res, ptr %Dst
+ ret void
+}
+
+; Receive stack argument.
+define half @fun9(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) {
+; NOVEC-LABEL: fun9:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: lh %r0, 342(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ler %f8, %f6
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun9:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: vlreph %v0, 342(%r15)
+; VECTOR-NEXT: ldr %f8, %f6
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+ %A0 = fadd half %Arg3, %Arg4
+ ret half %A0
+}
+
+; Pass stack argument.
+define void @fun10(half %Arg0) {
+; NOVEC-LABEL: fun10:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -168
+; NOVEC-NEXT: .cfi_def_cfa_offset 328
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: ler %f2, %f0
+; NOVEC-NEXT: ler %f4, %f0
+; NOVEC-NEXT: ler %f6, %f0
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, fun9 at PLT
+; NOVEC-NEXT: lmg %r14, %r15, 280(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun10:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -168
+; VECTOR-NEXT: .cfi_def_cfa_offset 328
+; VECTOR-NEXT: ldr %f2, %f0
+; VECTOR-NEXT: ldr %f4, %f0
+; VECTOR-NEXT: ldr %f6, %f0
+; VECTOR-NEXT: vsteh %v0, 166(%r15), 0
+; VECTOR-NEXT: brasl %r14, fun9 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 280(%r15)
+; VECTOR-NEXT: br %r14
+ call void @fun9(half %Arg0, half %Arg0, half %Arg0, half %Arg0, half %Arg0)
+ ret void
+}
+
+; Test loading some immediates from the Constant Pool.
+declare void @foo2(half, half, half, half)
+define void @fun11() {
+; NOVEC-LABEL: fun11:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lhrl %r0, .LCPI11_0
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: lhrl %r0, .LCPI11_1
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; NOVEC-NEXT: lzer %f2
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lcdfr %f0, %f2
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; NOVEC-NEXT: brasl %r14, foo2 at PLT
+; NOVEC-NEXT: lmg %r14, %r15, 272(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun11:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lzer %f2
+; VECTOR-NEXT: vrepih %v4, 13824
+; VECTOR-NEXT: vrepih %v6, 15360
+; VECTOR-NEXT: lcdfr %f0, %f2
+; VECTOR-NEXT: brasl %r14, foo2 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ call void @foo2(half -0.0, half 0.0, half 0.375, half 1.0)
+ ret void
+}
+
+; Test a tail call.
+declare void @foo3(half)
+define void @fun12(half %Arg0) {
+; NOVEC-LABEL: fun12:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: jg foo3 at PLT
+;
+; VECTOR-LABEL: fun12:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: jg foo3 at PLT
+entry:
+ tail call void @foo3(half %Arg0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-round-03.ll b/llvm/test/CodeGen/SystemZ/fp-round-03.ll
index d35cafc406ad774..e7a9c0fa6e87aa6 100644
--- a/llvm/test/CodeGen/SystemZ/fp-round-03.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-round-03.ll
@@ -1,6 +1,19 @@
; Test rounding functions for z14 and above.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -verify-machineinstrs \
+; RUN: | FileCheck %s
+
+; Test that an f16 intrinsic can be lowered with promotion to float.
+declare half @llvm.rint.f16(half %f)
+define half @f0(half %f) {
+; CHECK-LABEL: f0:
+; CHECK: brasl %r14, __extendhfsf2 at PLT
+; CHECK: fiebra %f0, 0, %f0, 0
+; CHECK: brasl %r14, __truncsfhf2 at PLT
+; CHECK: br %r14
+ %res = call half @llvm.rint.f16(half %f)
+ ret half %res
+}
; Test rint for f32.
declare float @llvm.rint.f32(float %f)
diff --git a/llvm/test/CodeGen/SystemZ/spill-half-01.mir b/llvm/test/CodeGen/SystemZ/spill-half-01.mir
new file mode 100644
index 000000000000000..56f4ecbffd2c639
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/spill-half-01.mir
@@ -0,0 +1,47 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+# RUN: -start-before=greedy | FileCheck %s -check-prefix=CHECK
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+# RUN: -start-before=greedy | FileCheck %s -check-prefix=VECTOR
+
+# Test spilling / reloading of an fp16bit virtual register.
+
+---
+name: fun0
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fp16bit }
+liveins:
+ - { reg: '$f0h', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $f0h
+
+ ; CHECK-LABEL: fun0:
+ ; CHECK-NOT: $f0
+ ; CHECK: # kill: def $f0h killed $f0h killed $f0d def $f0d
+ ; CHECK-NEXT: lgdr %r0, %f0
+ ; CHECK-NEXT: srlg %r0, %r0, 48
+ ; CHECK-NEXT: sth %r0, 166(%r15) # 2-byte Folded Spill
+ ; CHECK-NEXT: #APP
+ ; CHECK-NEXT: #NO_APP
+ ; CHECK: lh %r0, 166(%r15) # 2-byte Folded Reload
+ ; CHECK-NEXT: sllg %r0, %r0, 48
+ ; CHECK-NEXT: ldgr %f0, %r0
+ ; CHECK: # kill: def $f0h killed $f0h killed $f0d
+ ; CHECK-NOT: $f0
+
+ ; VECTOR-LABEL: fun0:
+ ; VECTOR: vsteh %v0, 166(%r15), 0 # 2-byte Folded Spill
+ ; VECTOR-NEXT: #APP
+ ; VECTOR-NEXT: #NO_APP
+ ; VECTOR-NEXT: vlreph %v0, 166(%r15) # 2-byte Folded Reload
+
+ %0:fp16bit = COPY $f0h
+ INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0d, 12, implicit-def dead early-clobber $r1d, 12, implicit-def dead early-clobber $r2d, 12, implicit-def dead early-clobber $r3d, 12, implicit-def dead early-clobber $r4d, 12, implicit-def dead early-clobber $r5d, 12, implicit-def dead early-clobber $r6d, 12, implicit-def dead early-clobber $r7d, 12, implicit-def dead early-clobber $r8d, 12, implicit-def dead early-clobber $r9d, 12, implicit-def dead early-clobber $r10d, 12, implicit-def dead early-clobber $r11d, 12, implicit-def dead early-clobber $r12d, 12, implicit-def dead early-clobber $r13d, 12, implicit-def dead early-clobber $r14d, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d
+ $f0h = COPY %0
+ Return implicit $f0h
+...
diff --git a/llvm/test/CodeGen/SystemZ/spill-half-02.mir b/llvm/test/CodeGen/SystemZ/spill-half-02.mir
new file mode 100644
index 000000000000000..4934d0b72811575
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/spill-half-02.mir
@@ -0,0 +1,40 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+# RUN: -start-before=greedy | FileCheck %s
+
+# Test spilling / reloading of an vr16bit virtual register.
+
+---
+name: fun0
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: addr64bit }
+ - { id: 1, class: addr64bit }
+ - { id: 2, class: vr16bit }
+liveins:
+ - { reg: '$r2d', virtual-reg: '%0' }
+ - { reg: '$r3d', virtual-reg: '%1' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $r2d, $r3d
+
+ ; CHECK-LABEL: fun0:
+ ; CHECK: stg %r3, 168(%r15) # 8-byte Folded Spill
+ ; CHECK-NEXT: vlreph %v0, 0(%r2)
+ ; CHECK-NEXT: vsteh %v0, 166(%r15), 0 # 2-byte Folded Spill
+ ; CHECK-NEXT: #APP
+ ; CHECK-NEXT: #NO_APP
+ ; CHECK-NEXT: lg %r1, 168(%r15) # 8-byte Folded Reload
+ ; CHECK-NEXT: vlreph %v0, 166(%r15) # 2-byte Folded Reload
+ ; CHECK-NEXT: vsteh %v0, 0(%r1), 0
+
+ %1:addr64bit = COPY $r3d
+ %0:addr64bit = COPY $r2d
+ %2:vr16bit = VL16 %0, 0, $noreg
+ INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0d, 12, implicit-def dead early-clobber $r1d, 12, implicit-def dead early-clobber $r2d, 12, implicit-def dead early-clobber $r3d, 12, implicit-def dead early-clobber $r4d, 12, implicit-def dead early-clobber $r5d, 12, implicit-def dead early-clobber $r6d, 12, implicit-def dead early-clobber $r7d, 12, implicit-def dead early-clobber $r8d, 12, implicit-def dead early-clobber $r9d, 12, implicit-def dead early-clobber $r10d, 12, implicit-def dead early-clobber $r11d, 12, implicit-def dead early-clobber $r12d, 12, implicit-def dead early-clobber $r13d, 12, implicit-def dead early-clobber $r14d, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d, 12, implicit-def dead early-clobber $f16d, 12, implicit-def dead early-clobber $f17d, 12, implicit-def dead early-clobber $f18d, 12, implicit-def dead early-clobber $f19d, 12, implicit-def dead early-clobber $f20d, 12, implicit-def dead early-clobber $f21d, 12, implicit-def dead early-clobber $f22d, 12, implicit-def dead early-clobber $f23d, 12, implicit-def dead early-clobber $f24d, 12, implicit-def dead early-clobber $f25d, 12, implicit-def dead early-clobber $f26d, 12, implicit-def dead early-clobber $f27d, 12, implicit-def dead early-clobber $f28d, 12, implicit-def dead early-clobber $f29d, 12, implicit-def dead early-clobber $f30d, 12, implicit-def dead early-clobber $f31d
+ VST16 %2, %1, 0, $noreg
+ Return
+...
diff --git a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
index 7fc7bd3e347bb58..95ba0b4bf346638 100644
--- a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
+++ b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
@@ -18,19 +18,19 @@ body: |
; CHECK-NEXT: $r2l = COPY [[COPY]]
; CHECK-NEXT: $r3l = COPY killed [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:grh32bit = COPY killed [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
+ ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 524298 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
; CHECK-NEXT: [[COPY3:%[0-9]+]]:grh32bit = COPY killed [[COPY2]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:grh32bit = COPY [[COPY3]]
- ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 393225 /* reguse:GRH32Bit */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 524299 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 524297 /* reguse:GRH32Bit */, [[COPY3]]
; CHECK-NEXT: $r2l = COPY killed [[COPY4]]
; CHECK-NEXT: Return implicit killed $r2l
%0:gr32bit = COPY killed $r2l
%2:grh32bit = COPY %0
$r2l = COPY %0
$r3l = COPY killed %0
- INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
+ INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 524298 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
%4:grh32bit = COPY killed %1
- INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 393225 /* reguse:GRH32Bit */, %4
+ INLINEASM &"stepb $1, $2", 0 /* attdialect */, 524299 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 524297 /* reguse:GRH32Bit */, %4
$r2l = COPY killed %3
Return implicit killed $r2l
...
>From 3e31dce36fc14f80762e2acb117f42129ac6dc77 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 27 Nov 2024 15:13:31 -0600
Subject: [PATCH 2/3] Review updates
---
clang/lib/Basic/Targets/SystemZ.h | 4 +-
clang/lib/CodeGen/Targets/SystemZ.cpp | 8 +-
.../test/CodeGen/SystemZ/systemz-inline-asm.c | 8 +
llvm/docs/LangRef.rst | 2 +-
.../SystemZ/AsmParser/SystemZAsmParser.cpp | 2 +
.../MCTargetDesc/SystemZMCTargetDesc.cpp | 150 ++++++++----------
llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 38 +++--
.../Target/SystemZ/SystemZISelLowering.cpp | 57 ++++---
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 6 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 32 ++--
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 2 +
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 4 +-
llvm/test/CodeGen/SystemZ/asm-10.ll | 9 ++
llvm/test/CodeGen/SystemZ/asm-17.ll | 11 ++
llvm/test/CodeGen/SystemZ/asm-19.ll | 19 +++
llvm/test/CodeGen/SystemZ/fp-cmp-04.ll | 68 +++++++-
llvm/test/CodeGen/SystemZ/fp-half-move.ll | 59 +++++++
llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll | 114 ++++++++++++-
.../test/CodeGen/SystemZ/fp-strict-cmps-04.ll | 37 +++++
...-asm-fp-int-casting-explicit-regs-zEC12.ll | 52 ++++++
...inline-asm-fp-int-casting-explicit-regs.ll | 40 +++++
.../inline-asm-fp-int-casting-zEC12.ll | 46 ++++++
.../SystemZ/inline-asm-fp-int-casting.ll | 52 ++++++
26 files changed, 681 insertions(+), 151 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-move.ll
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index b4da2c9ce64754a..107eb6aafa6b604 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -107,9 +107,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
unsigned getMinGlobalAlign(uint64_t Size, bool HasNonWeakDef) const override;
- bool useFP16ConversionIntrinsics() const override {
- return false;
- }
+ bool useFP16ConversionIntrinsics() const override { return false; }
void getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const override;
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index 021d764dbfd063e..9830dd7e2a663e0 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,7 +185,7 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
switch (BT->getKind()) {
- case BuiltinType::Float16: // _Float16
+ case BuiltinType::Float16: // _Float16
case BuiltinType::Float:
case BuiltinType::Double:
return true;
@@ -450,9 +450,9 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
if (isFPArgumentType(SingleElementTy)) {
assert(Size == 16 || Size == 32 || Size == 64);
return ABIArgInfo::getDirect(
- Size == 16 ? llvm::Type::getHalfTy(getVMContext())
- : Size == 32 ? llvm::Type::getFloatTy(getVMContext())
- : llvm::Type::getDoubleTy(getVMContext()));
+ Size == 16 ? llvm::Type::getHalfTy(getVMContext())
+ : Size == 32 ? llvm::Type::getFloatTy(getVMContext())
+ : llvm::Type::getDoubleTy(getVMContext()));
} else {
llvm::IntegerType *PassTy = llvm::IntegerType::get(getVMContext(), Size);
return Size <= 32 ? ABIArgInfo::getNoExtend(PassTy)
diff --git a/clang/test/CodeGen/SystemZ/systemz-inline-asm.c b/clang/test/CodeGen/SystemZ/systemz-inline-asm.c
index 2a9d6a5f8745480..919250e2170d9c3 100644
--- a/clang/test/CodeGen/SystemZ/systemz-inline-asm.c
+++ b/clang/test/CodeGen/SystemZ/systemz-inline-asm.c
@@ -106,6 +106,14 @@ void test_M(void) {
// CHECK: call void asm sideeffect "#FOO $0", "M"(i32 2147483647)
}
+_Float16 test_f16(_Float16 a) {
+ _Float16 f;
+ asm("ler %0, %1" : "=f" (f) : "f" (a));
+ return f;
+// CHECK-LABEL: define{{.*}} half @test_f16(half noundef %a)
+// CHECK: call half asm "ler $0, $1", "=f,f"(half %a)
+}
+
float test_f32(float f, float g) {
asm("aebr %0, %2" : "=f" (f) : "0" (f), "f" (g));
return f;
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index abfd2fdfb9de716..0239d27c856e9f6 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -5555,7 +5555,7 @@ SystemZ:
address context evaluates as zero).
- ``h``: A 32-bit value in the high part of a 64bit data register
(LLVM-specific)
-- ``f``: A 32, 64, or 128-bit floating-point register.
+- ``f``: A 16, 32, 64, or 128-bit floating-point register.
X86:
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 7f528918850261d..2a12a1a782491c9 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -888,6 +888,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands,
return ParseStatus::NoMatch;
// Determine the LLVM register number according to Kind.
+ // clang-format off
const unsigned *Regs;
switch (Kind) {
case GR32Reg: Regs = SystemZMC::GR32Regs; break;
@@ -905,6 +906,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands,
case AR32Reg: Regs = SystemZMC::AR32Regs; break;
case CR64Reg: Regs = SystemZMC::CR64Regs; break;
}
+ // clang-format on
if (Regs[Reg.Num] == 0)
return Error(Reg.StartLoc, "invalid register pair");
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 291b6789c78f697..d611fe493f8959e 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -34,118 +34,96 @@ using namespace llvm;
#include "SystemZGenRegisterInfo.inc"
const unsigned SystemZMC::GR32Regs[16] = {
- SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
- SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
- SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L,
- SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L
-};
+ SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
+ SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
+ SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L,
+ SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L};
const unsigned SystemZMC::GRH32Regs[16] = {
- SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H,
- SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H,
- SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H,
- SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H
-};
+ SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H,
+ SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H,
+ SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H,
+ SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H};
const unsigned SystemZMC::GR64Regs[16] = {
- SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D,
- SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D,
- SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D,
- SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D
-};
+ SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D,
+ SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D,
+ SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D,
+ SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D};
const unsigned SystemZMC::GR128Regs[16] = {
- SystemZ::R0Q, 0, SystemZ::R2Q, 0,
- SystemZ::R4Q, 0, SystemZ::R6Q, 0,
- SystemZ::R8Q, 0, SystemZ::R10Q, 0,
- SystemZ::R12Q, 0, SystemZ::R14Q, 0
-};
+ SystemZ::R0Q, 0, SystemZ::R2Q, 0, SystemZ::R4Q, 0, SystemZ::R6Q, 0,
+ SystemZ::R8Q, 0, SystemZ::R10Q, 0, SystemZ::R12Q, 0, SystemZ::R14Q, 0};
const unsigned SystemZMC::FP16Regs[16] = {
- SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H,
- SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H,
- SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H,
- SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H
-};
+ SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H,
+ SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H,
+ SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H,
+ SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H};
const unsigned SystemZMC::FP32Regs[16] = {
- SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
- SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
- SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
- SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S
-};
+ SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
+ SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
+ SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+ SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S};
const unsigned SystemZMC::FP64Regs[16] = {
- SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
- SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
- SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
- SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D
-};
+ SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
+ SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
+ SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+ SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D};
const unsigned SystemZMC::FP128Regs[16] = {
- SystemZ::F0Q, SystemZ::F1Q, 0, 0,
- SystemZ::F4Q, SystemZ::F5Q, 0, 0,
- SystemZ::F8Q, SystemZ::F9Q, 0, 0,
- SystemZ::F12Q, SystemZ::F13Q, 0, 0
-};
+ SystemZ::F0Q, SystemZ::F1Q, 0, 0, SystemZ::F4Q, SystemZ::F5Q, 0, 0,
+ SystemZ::F8Q, SystemZ::F9Q, 0, 0, SystemZ::F12Q, SystemZ::F13Q, 0, 0};
const unsigned SystemZMC::VR16Regs[32] = {
- SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H,
- SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H,
- SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H,
- SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H,
- SystemZ::F16H, SystemZ::F17H, SystemZ::F18H, SystemZ::F19H,
- SystemZ::F20H, SystemZ::F21H, SystemZ::F22H, SystemZ::F23H,
- SystemZ::F24H, SystemZ::F25H, SystemZ::F26H, SystemZ::F27H,
- SystemZ::F28H, SystemZ::F29H, SystemZ::F30H, SystemZ::F31H
-};
+ SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H, SystemZ::F4H,
+ SystemZ::F5H, SystemZ::F6H, SystemZ::F7H, SystemZ::F8H, SystemZ::F9H,
+ SystemZ::F10H, SystemZ::F11H, SystemZ::F12H, SystemZ::F13H, SystemZ::F14H,
+ SystemZ::F15H, SystemZ::F16H, SystemZ::F17H, SystemZ::F18H, SystemZ::F19H,
+ SystemZ::F20H, SystemZ::F21H, SystemZ::F22H, SystemZ::F23H, SystemZ::F24H,
+ SystemZ::F25H, SystemZ::F26H, SystemZ::F27H, SystemZ::F28H, SystemZ::F29H,
+ SystemZ::F30H, SystemZ::F31H};
const unsigned SystemZMC::VR32Regs[32] = {
- SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
- SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
- SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
- SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S,
- SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S,
- SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S,
- SystemZ::F24S, SystemZ::F25S, SystemZ::F26S, SystemZ::F27S,
- SystemZ::F28S, SystemZ::F29S, SystemZ::F30S, SystemZ::F31S
-};
+ SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, SystemZ::F4S,
+ SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, SystemZ::F8S, SystemZ::F9S,
+ SystemZ::F10S, SystemZ::F11S, SystemZ::F12S, SystemZ::F13S, SystemZ::F14S,
+ SystemZ::F15S, SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S,
+ SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S, SystemZ::F24S,
+ SystemZ::F25S, SystemZ::F26S, SystemZ::F27S, SystemZ::F28S, SystemZ::F29S,
+ SystemZ::F30S, SystemZ::F31S};
const unsigned SystemZMC::VR64Regs[32] = {
- SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
- SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
- SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
- SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D,
- SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D,
- SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D,
- SystemZ::F24D, SystemZ::F25D, SystemZ::F26D, SystemZ::F27D,
- SystemZ::F28D, SystemZ::F29D, SystemZ::F30D, SystemZ::F31D
-};
+ SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, SystemZ::F4D,
+ SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, SystemZ::F8D, SystemZ::F9D,
+ SystemZ::F10D, SystemZ::F11D, SystemZ::F12D, SystemZ::F13D, SystemZ::F14D,
+ SystemZ::F15D, SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D,
+ SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D, SystemZ::F24D,
+ SystemZ::F25D, SystemZ::F26D, SystemZ::F27D, SystemZ::F28D, SystemZ::F29D,
+ SystemZ::F30D, SystemZ::F31D};
const unsigned SystemZMC::VR128Regs[32] = {
- SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3,
- SystemZ::V4, SystemZ::V5, SystemZ::V6, SystemZ::V7,
- SystemZ::V8, SystemZ::V9, SystemZ::V10, SystemZ::V11,
- SystemZ::V12, SystemZ::V13, SystemZ::V14, SystemZ::V15,
- SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19,
- SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23,
- SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27,
- SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31
-};
+ SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3, SystemZ::V4,
+ SystemZ::V5, SystemZ::V6, SystemZ::V7, SystemZ::V8, SystemZ::V9,
+ SystemZ::V10, SystemZ::V11, SystemZ::V12, SystemZ::V13, SystemZ::V14,
+ SystemZ::V15, SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19,
+ SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23, SystemZ::V24,
+ SystemZ::V25, SystemZ::V26, SystemZ::V27, SystemZ::V28, SystemZ::V29,
+ SystemZ::V30, SystemZ::V31};
const unsigned SystemZMC::AR32Regs[16] = {
- SystemZ::A0, SystemZ::A1, SystemZ::A2, SystemZ::A3,
- SystemZ::A4, SystemZ::A5, SystemZ::A6, SystemZ::A7,
- SystemZ::A8, SystemZ::A9, SystemZ::A10, SystemZ::A11,
- SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15
-};
+ SystemZ::A0, SystemZ::A1, SystemZ::A2, SystemZ::A3,
+ SystemZ::A4, SystemZ::A5, SystemZ::A6, SystemZ::A7,
+ SystemZ::A8, SystemZ::A9, SystemZ::A10, SystemZ::A11,
+ SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15};
const unsigned SystemZMC::CR64Regs[16] = {
- SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3,
- SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7,
- SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11,
- SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15
-};
+ SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3,
+ SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7,
+ SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11,
+ SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15};
unsigned SystemZMC::getFirstReg(unsigned Reg) {
static unsigned Map[SystemZ::NUM_TARGET_REGS];
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 8d4dc97f516824a..79babdca86cb5a6 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -138,6 +138,25 @@ static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) {
.addImm(0);
}
+// MI extracts the first element of the source vector.
+static MCInst lowerVecEltExtraction(const MachineInstr *MI, unsigned Opcode) {
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()))
+ .addReg(0)
+ .addImm(0);
+}
+
+// MI inserts value into the first element of the destination vector.
+static MCInst lowerVecEltInsertion(const MachineInstr *MI, unsigned Opcode) {
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+ .addReg(MI->getOperand(1).getReg())
+ .addReg(0)
+ .addImm(0);
+}
+
// The XPLINK ABI requires that a no-op encoding the call type is emitted after
// each call to a subroutine. This information can be used by the called
// function to determine its entry point, e.g. for generating a backtrace. The
@@ -571,18 +590,19 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
case SystemZ::LFER:
- LoweredMI = MCInstBuilder(SystemZ::VLGVF)
- .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
- .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()))
- .addReg(0).addImm(0);
+ LoweredMI = lowerVecEltExtraction(MI, SystemZ::VLGVF);
+ break;
+
+ case SystemZ::LFER_16:
+ LoweredMI = lowerVecEltExtraction(MI, SystemZ::VLGVH);
break;
case SystemZ::LEFR:
- LoweredMI = MCInstBuilder(SystemZ::VLVGF)
- .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
- .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
- .addReg(MI->getOperand(1).getReg())
- .addReg(0).addImm(0);
+ LoweredMI = lowerVecEltInsertion(MI, SystemZ::VLVGF);
+ break;
+
+ case SystemZ::LEFR_16:
+ LoweredMI = lowerVecEltInsertion(MI, SystemZ::VLVGH);
break;
#define LOWER_LOW(NAME) \
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index fb159236ec5c2b3..f26d5556793b1cf 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -527,6 +527,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(Op, MVT::f16, Subtarget.hasVector() ? Legal : Custom);
setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall);
+ if (Subtarget.hasVector()) // f16 <-> i16 bitcasts.
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
@@ -1343,7 +1345,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
case 'f': // Floating-point register
if (!useSoftFloat()) {
- if (VT.getSizeInBits() == 64)
+ if (VT.getSizeInBits() == 16)
+ return std::make_pair(0U, &SystemZ::FP16BitRegClass);
+ else if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &SystemZ::FP64BitRegClass);
else if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &SystemZ::FP128BitRegClass);
@@ -1353,6 +1357,8 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
case 'v': // Vector register
if (Subtarget.hasVector()) {
+ if (VT.getSizeInBits() == 16)
+ return std::make_pair(0U, &SystemZ::VR16BitRegClass);
if (VT.getSizeInBits() == 32)
return std::make_pair(0U, &SystemZ::VR32BitRegClass);
if (VT.getSizeInBits() == 64)
@@ -1388,6 +1394,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
if (useSoftFloat())
return std::make_pair(
0u, static_cast<const TargetRegisterClass *>(nullptr));
+ if (getVTSizeInBits() == 16)
+ return parseRegisterNumber(Constraint, &SystemZ::FP16BitRegClass,
+ SystemZMC::FP16Regs, 16);
if (getVTSizeInBits() == 32)
return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
SystemZMC::FP32Regs, 16);
@@ -1401,6 +1410,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
if (!Subtarget.hasVector())
return std::make_pair(
0u, static_cast<const TargetRegisterClass *>(nullptr));
+ if (getVTSizeInBits() == 16)
+ return parseRegisterNumber(Constraint, &SystemZ::VR16BitRegClass,
+ SystemZMC::VR16Regs, 32);
if (getVTSizeInBits() == 32)
return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
SystemZMC::VR32Regs, 32);
@@ -6159,7 +6171,7 @@ SDValue SystemZTargetLowering::lowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
SDValue In = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0);
if (In.getSimpleValueType() != MVT::f16)
- return Op; // Legal
+ return Op; // Legal
return SDValue(); // Let legalizer emit the libcall.
}
@@ -6179,18 +6191,18 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
} else {
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
assert(EVT(RegVT) == Ld->getMemoryVT() && "Unhandled f16 load");
- NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
- Ld->getBasePtr(), Ld->getPointerInfo(),
- MVT::i16, Ld->getOriginalAlign(),
- Ld->getMemOperand()->getFlags());
+ NewLd =
+ DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getPointerInfo(), MVT::i16,
+ Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
}
// Load as integer, shift and then insert into upper 2 bytes of the FP
// register.
SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i32, NewLd,
DAG.getConstant(16, DL, MVT::i32));
SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Shft);
- SDValue F16Val = DAG.getTargetExtractSubreg(SystemZ::subreg_h16,
- DL, MVT::f16, BCast);
+ SDValue F16Val =
+ DAG.getTargetExtractSubreg(SystemZ::subreg_h16, DL, MVT::f16, BCast);
return DAG.getMergeValues({F16Val, NewLd.getValue(1)}, DL);
}
@@ -6203,19 +6215,20 @@ SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
// Move into a GPR, shift and store the 2 bytes.
SDLoc DL(Op);
SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f32);
- SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL,
- MVT::f32, SDValue(U32, 0), StoredVal);
+ SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL, MVT::f32,
+ SDValue(U32, 0), StoredVal);
SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, In32);
SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i32, BCast,
DAG.getConstant(16, DL, MVT::i32));
if (auto *AtomicSt = dyn_cast<AtomicSDNode>(Op.getNode()))
return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MVT::i16, AtomicSt->getChain(),
- Shft, AtomicSt->getBasePtr(), AtomicSt->getMemOperand());
+ Shft, AtomicSt->getBasePtr(),
+ AtomicSt->getMemOperand());
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
- return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(),
- MVT::i16, St->getMemOperand());
+ return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(), MVT::i16,
+ St->getMemOperand());
}
SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op,
@@ -6458,8 +6471,7 @@ static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src,
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i128, Lo, Hi);
}
-// Lower operations with invalid operand or result types (currently used
-// only for 128-bit integer types).
+// Lower operations with invalid operand or result types.
void
SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
@@ -6519,11 +6531,20 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
break;
}
case ISD::BITCAST: {
+ SDLoc DL(N);
SDValue Src = N->getOperand(0);
- if (N->getValueType(0) == MVT::i128 && Src.getValueType() == MVT::f128 &&
- !useSoftFloat()) {
- SDLoc DL(N);
+ EVT SrcVT = Src.getValueType();
+ EVT ResVT = N->getValueType(0);
+ if (ResVT == MVT::i128 && SrcVT == MVT::f128 && !useSoftFloat())
Results.push_back(expandBitCastF128ToI128(DAG, Src, DL));
+ else if (SrcVT == MVT::i16 && ResVT == MVT::f16) {
+ SDValue In32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
+ Results.push_back(
+ SDValue(DAG.getMachineNode(SystemZ::LEFR_16, DL, MVT::f16, In32), 0));
+ } else if (SrcVT == MVT::f16 && ResVT == MVT::i16) {
+ SDValue ExtractedI32 =
+ SDValue(DAG.getMachineNode(SystemZ::LFER_16, DL, MVT::i32, Src), 0);
+ Results.push_back(DAG.getZExtOrTrunc(ExtractedI32, DL, ResVT));
}
break;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 5b4b73d586a7962..36e975a233b395a 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -50,9 +50,9 @@ def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
// For z13 we prefer LDR over LER to avoid partial register dependencies.
let isCodeGenOnly = 1 in {
- def LER16 : UnaryRR <"ler", 0x38, null_frag, FP16, FP16>;
- def LDR16 : UnaryRR<"ldr", 0x28, null_frag, FP16, FP16>;
- def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+ def LER16 : UnaryRR<"ler", 0x38, null_frag, FP16, FP16>;
+ def LDR16 : UnaryRR<"ldr", 0x28, null_frag, FP16, FP16>;
+ def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
}
// Moves between two floating-point registers that also set the condition
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 470543824dc5d06..b0c0d76faa4c463 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -1007,16 +1007,16 @@ void SystemZInstrInfo::storeRegToStackSlot(
Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
BuildMI(MBB, MBBI, DL, get(SystemZ::COPY))
- .addReg(FP64Reg, RegState::DefineNoRead, SystemZ::subreg_h16)
- .addReg(SrcReg, getKillRegState(isKill));
+ .addReg(FP64Reg, RegState::DefineNoRead, SystemZ::subreg_h16)
+ .addReg(SrcReg, getKillRegState(isKill));
BuildMI(MBB, MBBI, DL, get(SystemZ::LGDR), GR64Reg)
- .addReg(FP64Reg, RegState::Kill);
+ .addReg(FP64Reg, RegState::Kill);
BuildMI(MBB, MBBI, DL, get(SystemZ::SRLG), GR64Reg)
- .addReg(GR64Reg)
- .addReg(0)
- .addImm(48);
+ .addReg(GR64Reg)
+ .addReg(0)
+ .addImm(48);
addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::STH))
- .addReg(GR64Reg, RegState::Kill, SystemZ::subreg_l32),
+ .addReg(GR64Reg, RegState::Kill, SystemZ::subreg_l32),
FrameIdx);
return;
}
@@ -1046,18 +1046,18 @@ void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
"Expected non-SSA form with virtual registers.");
Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
- addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::LH))
- .addReg(GR64Reg, RegState::DefineNoRead,
- SystemZ::subreg_l32),
- FrameIdx);
+ addFrameReference(
+ BuildMI(MBB, MBBI, DL, get(SystemZ::LH))
+ .addReg(GR64Reg, RegState::DefineNoRead, SystemZ::subreg_l32),
+ FrameIdx);
BuildMI(MBB, MBBI, DL, get(SystemZ::SLLG), GR64Reg)
- .addReg(GR64Reg)
- .addReg(0)
- .addImm(48);
+ .addReg(GR64Reg)
+ .addReg(0)
+ .addImm(48);
BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), FP64Reg)
- .addReg(GR64Reg, RegState::Kill);
+ .addReg(GR64Reg, RegState::Kill);
BuildMI(MBB, MBBI, DL, get(SystemZ::COPY), DestReg)
- .addReg(FP64Reg, RegState::Kill, SystemZ::subreg_h16);
+ .addReg(FP64Reg, RegState::Kill, SystemZ::subreg_h16);
return;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 7b6e4deed18ef64..140fa0a05268b60 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1790,6 +1790,8 @@ let Predicates = [FeatureVector] in {
def : Pat<(f32 (bitconvert (i32 GR32:$src))), (LEFR GR32:$src)>;
def : Pat<(i32 (bitconvert (f32 VR32:$src))),
(EXTRACT_SUBREG (LFER VR32:$src), subreg_l32)>;
+ def LEFR_16 : UnaryAliasVRS<VR16, GR32>;
+ def LFER_16 : UnaryAliasVRS<GR32, VR16>;
}
// Floating-point values are stored in element 0 of the corresponding
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index 6c1d1df83fafa3f..c449b2d400777ee 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1376,8 +1376,8 @@ def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>;
// Vector: Floating-point insertion and extraction
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
-def : InstRW<[WLat4, FXb, NormalGr], (instregex "LFER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>;
+def : InstRW<[WLat4, FXb, NormalGr], (instregex "LFER(_16)?$")>;
//===----------------------------------------------------------------------===//
// Vector: String instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index c47fcb7cb0a11b9..b17c6c6e611b100 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1448,8 +1448,8 @@ def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>;
// Vector: Floating-point insertion and extraction
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
-def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER(_16)?$")>;
//===----------------------------------------------------------------------===//
// Vector: String instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 28d34d80adb812a..3d0bb0ed902265f 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -1491,8 +1491,8 @@ def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>;
// Vector: Floating-point insertion and extraction
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
-def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER(_16)?$")>;
//===----------------------------------------------------------------------===//
// Vector: String instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index 24713b8fc93b56f..51b422f5f533736 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -1499,8 +1499,8 @@ def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>;
// Vector: Floating-point insertion and extraction
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
-def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER(_16)?$")>;
//===----------------------------------------------------------------------===//
// Vector: String instructions
diff --git a/llvm/test/CodeGen/SystemZ/asm-10.ll b/llvm/test/CodeGen/SystemZ/asm-10.ll
index b71db8350781de8..8226b8a1a2d2596 100644
--- a/llvm/test/CodeGen/SystemZ/asm-10.ll
+++ b/llvm/test/CodeGen/SystemZ/asm-10.ll
@@ -2,6 +2,15 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
+define half @f0() {
+; CHECK-LABEL: f0:
+; CHECK: lzer %f1
+; CHECK: blah %f0 %f1
+; CHECK: br %r14
+ %val = call half asm "blah $0 $1", "=&f,f" (half 0.0)
+ ret half %val
+}
+
define float @f1() {
; CHECK-LABEL: f1:
; CHECK: lzer %f1
diff --git a/llvm/test/CodeGen/SystemZ/asm-17.ll b/llvm/test/CodeGen/SystemZ/asm-17.ll
index c9c4d73c66ebb5b..dad75d4d012d1e3 100644
--- a/llvm/test/CodeGen/SystemZ/asm-17.ll
+++ b/llvm/test/CodeGen/SystemZ/asm-17.ll
@@ -25,6 +25,17 @@ define i64 @f2() {
ret i64 %ret
}
+; Test 16-bit FPRs.
+define half @f3_half() {
+; CHECK-LABEL: f3_half:
+; CHECK: lzer %f4
+; CHECK: blah %f4
+; CHECK: ler %f0, %f4
+; CHECK: br %r14
+ %ret = call half asm "blah $0", "={f4},0" (half 0.0)
+ ret half %ret
+}
+
; Test i32 FPRs.
define float @f3() {
; CHECK-LABEL: f3:
diff --git a/llvm/test/CodeGen/SystemZ/asm-19.ll b/llvm/test/CodeGen/SystemZ/asm-19.ll
index e16fdfa13fce6a9..6c77fb55071cabb 100644
--- a/llvm/test/CodeGen/SystemZ/asm-19.ll
+++ b/llvm/test/CodeGen/SystemZ/asm-19.ll
@@ -3,6 +3,15 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -no-integrated-as | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -no-integrated-as | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-Z14
+define half @f0() {
+; CHECK-LABEL: f0:
+; CHECK: lzer %f1
+; CHECK: blah %f0 %f1
+; CHECK: br %r14
+ %val = call half asm "blah $0 $1", "=&v,v" (half 0.0)
+ ret half %val
+}
+
define float @f1() {
; CHECK-LABEL: f1:
; CHECK: lzer %f1
@@ -86,6 +95,16 @@ define <4 x float> @f9() {
ret <4 x float> %val
}
+define half @f10_half() {
+; CHECK-LABEL: f10_half:
+; CHECK: lzer %f4
+; CHECK: blah %f4
+; CHECK: ldr %f0, %f4
+; CHECK: br %r14
+ %ret = call half asm "blah $0", "={v4},0" (half 0.0)
+ ret half %ret
+}
+
define float @f10() {
; CHECK-LABEL: f10:
; CHECK: lzer %f4
diff --git a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
index c1773abe92305d2..d3d641357ae588a 100644
--- a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
@@ -227,6 +227,38 @@ exit:
ret float %add
}
+define half @f12_half(half %dummy, half %val, ptr %dest) {
+; CHECK-LABEL: f12_half:
+; CHECK: ler %f8, %f2
+; CHECK-NEXT: ler %f0, %f2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %f0
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ltebr %f0, %f0
+; CHECK-NEXT: jl .LBB11_2
+; CHECK-NEXT:# %bb.1:
+; CHECK-NEXT: lgdr %r0, %f8
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT:.LBB11_2:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: ld %f8, 160(%r15)
+; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ call void asm sideeffect "blah $0", "{f0}"(half %val)
+ %cmp = fcmp olt half %val, 0.0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store half %val, ptr %dest
+ br label %exit
+
+exit:
+ ret half %val
+}
+
; %val in %f2 must be preserved during comparison and also copied to %f0.
define float @f12(float %dummy, float %val, ptr %dest) {
; CHECK-LABEL: f12:
@@ -304,6 +336,38 @@ exit:
ret void
}
+define half @f15_half(half %val, half %dummy, ptr %dest) {
+; CHECK-LABEL: f15_half:
+; CHECK: ler %f8, %f0
+; CHECK-NEXT: ler %f2, %f0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %f2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ltebr %f0, %f0
+; CHECK-NEXT: jl .LBB15_2
+; CHECK-NEXT:# %bb.1:
+; CHECK-NEXT: lgdr %r0, %f8
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT:.LBB15_2:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: ld %f8, 160(%r15)
+; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ call void asm sideeffect "blah $0", "{f2}"(half %val)
+ %cmp = fcmp olt half %val, 0.0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store half %val, ptr %dest
+ br label %exit
+
+exit:
+ ret half %val
+}
+
define float @f15(float %val, float %dummy, ptr %dest) {
; CHECK-LABEL: f15:
; CHECK: ltebr %f1, %f0
@@ -374,7 +438,7 @@ define float @f18(float %dummy, float %a, ptr %dest) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lnebr %f0, %f2
; CHECK-NEXT: blr %r14
-; CHECK-NEXT: .LBB17_1: # %store
+; CHECK-NEXT: .LBB19_1: # %store
; CHECK-NEXT: ste %f0, 0(%r2)
; CHECK-NEXT: br %r14
entry:
@@ -397,7 +461,7 @@ define float @f19(float %dummy, float %a, ptr %dest) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lcebr %f0, %f2
; CHECK-NEXT: bler %r14
-; CHECK-NEXT: .LBB18_1: # %store
+; CHECK-NEXT: .LBB20_1: # %store
; CHECK-NEXT: ste %f0, 0(%r2)
; CHECK-NEXT: br %r14
entry:
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-move.ll b/llvm/test/CodeGen/SystemZ/fp-half-move.ll
new file mode 100644
index 000000000000000..3a0add0afcdbe89
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-move.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+;
+; Test moves between i16 and half.
+
+define half @f1(ptr %ptr) {
+; CHECK-LABEL: f1:
+; NOVEC-LABEL: f1:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: aghi %r15, -168
+; NOVEC-NEXT: .cfi_def_cfa_offset 328
+; NOVEC-NEXT: llh %r0, 0(%r2)
+; NOVEC-NEXT: oill %r0, 255
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: aghi %r15, 168
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: f1:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: oill %r0, 255
+; VECTOR-NEXT: vlvgh %v0, %r0, 0
+; VECTOR-NEXT: br %r14
+ %L = load i16, ptr %ptr
+ %O = or i16 %L, 255
+ %res = bitcast i16 %O to half
+ ret half %res
+}
+
+define void @f2(half %val, ptr %ptr) {
+; CHECK-LABEL: f1:
+; NOVEC-LABEL: f2:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: aghi %r15, -168
+; NOVEC-NEXT: .cfi_def_cfa_offset 328
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: stc %r0, 0(%r2)
+; NOVEC-NEXT: aghi %r15, 168
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: f2:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: vlgvh %r0, %v0, 0
+; VECTOR-NEXT: stc %r0, 0(%r2)
+; VECTOR-NEXT: br %r14
+ %res = bitcast half %val to i16
+ %trunc = trunc i16 %res to i8
+ store i8 %trunc, ptr %ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll
index bf9ccbcd70550e2..dfefc43c02bed67 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll
@@ -298,6 +298,43 @@ exit:
ret float %add
}
+define half @f12_half(half %dummy, half %val) #0 {
+; CHECK-LABEL: f12_half:
+; CHECK: ler %f9, %f2
+; CHECK-NEXT: ler %f0, %f2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lzer %f0
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f10
+; CHECK-NEXT: jl .LBB11_2
+; CHECK-NEXT:# %bb.1: # %store
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT:.LBB11_2: # %exit
+; CHECK-NEXT: ler %f0, %f8
+; CHECK: br %r14
+entry:
+ %ret = call half asm "ler $0, $1", "=f,{f0}"(half %val) #0
+ %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(
+ half %val, half 0.0,
+ metadata !"olt",
+ metadata !"fpexcept.strict") #0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ call void asm sideeffect "blah", ""() #0
+ br label %exit
+
+exit:
+ ret half %ret
+}
+
; Test that LER does not get converted to LTEBR as %f0 is live after it.
define float @f12(float %dummy, float %val) #0 {
; CHECK-LABEL: f12:
@@ -309,7 +346,7 @@ define float @f12(float %dummy, float %val) #0 {
; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
- %ret = call float asm "blah $1", "=f,{f0}"(float %val) #0
+ %ret = call float asm "$0 = blah $1", "=f,{f0}"(float %val) #0
%cmp = call i1 @llvm.experimental.constrained.fcmp.f32(
float %val, float 0.0,
metadata !"olt",
@@ -384,6 +421,43 @@ exit:
ret void
}
+define half @f15_half(half %val, half %dummy) #0 {
+; CHECK-LABEL: f15_half:
+; CHECK: ler %f9, %f0
+; CHECK-NEXT: ler %f2, %f0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: ler %f8, %f2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lzer %f0
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f10
+; CHECK-NEXT: jl .LBB15_2
+; CHECK-NEXT:# %bb.1: # %store
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT:.LBB15_2: # %exit
+; CHECK-NEXT: ler %f0, %f8
+; CHECK: br %r14
+entry:
+ %ret = call half asm "ler $0, $1", "=f,{f2}"(half %val) #0
+ %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(
+ half %val, half 0.0,
+ metadata !"olt",
+ metadata !"fpexcept.strict") #0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ call void asm sideeffect "blah", ""() #0
+ br label %exit
+
+exit:
+ ret half %ret
+}
+
; Test a case where it is the source rather than destination of LER that
; we need, but cannot convert the LER.
define float @f15(float %val, float %dummy) #0 {
@@ -491,6 +565,43 @@ exit:
ret float %res
}
+define half @f19_half(half %dummy, half %val) #0 {
+; CHECK-LABEL: f19_half:
+; CHECK: ler %f9, %f2
+; CHECK-NEXT: ler %f0, %f2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lzer %f0
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f10
+; CHECK-NEXT: jl .LBB20_2
+; CHECK-NEXT:# %bb.1: # %store
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT:.LBB20_2: # %exit
+; CHECK-NEXT: ler %f0, %f8
+; CHECK: br %r14
+entry:
+ %ret = call half asm sideeffect "ler $0, $1", "=f,{f0}"(half %val) #0
+ %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(
+ half %val, half 0.0,
+ metadata !"olt",
+ metadata !"fpexcept.strict") #0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ call void asm sideeffect "blah", ""() #0
+ br label %exit
+
+exit:
+ ret half %ret
+}
+
; Verify that we cannot convert LER to LTEBR and omit the compare if
; there may be an intervening change to the exception flags.
define float @f19(float %dummy, float %val) #0 {
@@ -524,6 +635,7 @@ declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, me
declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata)
+declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadata)
declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata)
declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata)
declare i1 @llvm.experimental.constrained.fcmp.f128(fp128, fp128, metadata, metadata)
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll
index e178769f263e691..ad86df175319263 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll
@@ -110,6 +110,43 @@ exit:
ret float %res
}
+define half @f12_half(half %dummy, half %val) #0 {
+; CHECK-LABEL: f12_half:
+; CHECK: ler %f9, %f2
+; CHECK-NEXT: ler %f0, %f2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lzer %f0
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f10, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: kebr %f0, %f10
+; CHECK-NEXT: jl .LBB4_2
+; CHECK-NEXT:# %bb.1: # %store
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT:.LBB4_2: # %exit
+; CHECK-NEXT: ler %f0, %f8
+; CHECK: br %r14
+entry:
+ %ret = call half asm "ler $0, $1", "=f,{f0}"(half %val) #0
+ %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(
+ half %val, half 0.0,
+ metadata !"olt",
+ metadata !"fpexcept.strict") #0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ call void asm sideeffect "blah", ""() #0
+ br label %exit
+
+exit:
+ ret half %ret
+}
+
; Test that LER does not get converted to LTEBR.
define float @f12(float %dummy, float %val) #0 {
; CHECK-LABEL: f12:
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll
index 44175f924f7fc0d..8b6ab6f4dff7eef 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll
@@ -3,6 +3,32 @@
;
; Test inline assembly where the operand is bitcasted.
+define signext i16 @short_and_f(i16 signext %cc_dep1) {
+; CHECK-LABEL: short_and_f:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: sth %r2, 164(%r15)
+; CHECK-NEXT: lh %r0, 164(%r15)
+; CHECK-NEXT: sll %r0, 16
+; CHECK-NEXT: risbhg %r0, %r0, 0, 159, 32
+; CHECK-NEXT: ldgr %f1, %r0
+; CHECK-NEXT: # kill: def $f1h killed $f1h killed $f1d
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: # kill: def $f1h killed $f1h def $f1d
+; CHECK-NEXT: lgdr %r0, %f1
+; CHECK-NEXT: risblg %r0, %r0, 0, 159, 32
+; CHECK-NEXT: srl %r0, 16
+; CHECK-NEXT: sth %r0, 166(%r15)
+; CHECK-NEXT: lgh %r2, 166(%r15)
+; CHECK-NEXT: aghi %r15, 168
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call i16 asm sideeffect "", "={f1},0"(i16 %cc_dep1)
+ ret i16 %0
+}
+
define signext i32 @int_and_f(i32 signext %cc_dep1) {
; CHECK-LABEL: int_and_f:
; CHECK: # %bb.0: # %entry
@@ -51,6 +77,32 @@ entry:
ret void
}
+define half @half_and_r(half %cc_dep1) {
+; CHECK-LABEL: half_and_r:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: risblg %r0, %r0, 0, 159, 32
+; CHECK-NEXT: srl %r0, 16
+; CHECK-NEXT: sth %r0, 166(%r15)
+; CHECK-NEXT: lgh %r2, 166(%r15)
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: sth %r2, 164(%r15)
+; CHECK-NEXT: lh %r0, 164(%r15)
+; CHECK-NEXT: sll %r0, 16
+; CHECK-NEXT: risbhg %r0, %r0, 0, 159, 32
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: aghi %r15, 168
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call half asm sideeffect "", "={r2},0"(half %cc_dep1)
+ ret half %0
+}
+
define float @float_and_r(float %cc_dep1) {
; CHECK-LABEL: float_and_r:
; CHECK: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
index 3cbf3d21dec5a12..cf4dbbff8bec0b4 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
@@ -4,6 +4,20 @@
;
; Test inline assembly where the operand is bitcasted.
+define signext i16 @short_and_f(i16 signext %cc_dep1) {
+; CHECK-LABEL: short_and_f:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vlvgh %v0, %r2, 0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vlgvh %r0, %v0, 0
+; CHECK-NEXT: lghr %r2, %r0
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call i16 asm sideeffect "", "={f0},0"(i16 %cc_dep1)
+ ret i16 %0
+}
+
define signext i32 @int_and_f(i32 signext %cc_dep1) {
; CHECK-LABEL: int_and_f:
; CHECK: # %bb.0: # %entry
@@ -101,6 +115,19 @@ entry:
ret void
}
+define half @half_and_r(half %cc_dep1) {
+; CHECK-LABEL: half_and_r:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vlgvh %r0, %v0, 0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vlvgh %v0, %r0, 0
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call half asm sideeffect "", "={r0},0"(half %cc_dep1)
+ ret half %0
+}
+
define float @float_and_r(float %cc_dep1) {
; CHECK-LABEL: float_and_r:
; CHECK: # %bb.0: # %entry
@@ -145,6 +172,19 @@ entry:
ret void
}
+define half @half_and_v(half %cc_dep1) {
+; CHECK-LABEL: half_and_v:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ldr %f3, %f0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: ldr %f0, %f3
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call half asm sideeffect "", "={v3},0"(half %cc_dep1)
+ ret half %0
+}
+
define float @float_and_v(float %cc_dep1) {
; CHECK-LABEL: float_and_v:
; CHECK: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll
index 1ef6eece80acb1e..46a966f0b64e4dd 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll
@@ -3,6 +3,30 @@
;
; Test inline assembly where the operand is bitcasted.
+define signext i16 @short_and_f(i16 signext %cc_dep1) {
+; CHECK-LABEL: short_and_f:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: sth %r2, 164(%r15)
+; CHECK-NEXT: lh %r0, 164(%r15)
+; CHECK-NEXT: sll %r0, 16
+; CHECK-NEXT: risbhg %r0, %r0, 0, 159, 32
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: risblg %r0, %r0, 0, 159, 32
+; CHECK-NEXT: srl %r0, 16
+; CHECK-NEXT: sth %r0, 166(%r15)
+; CHECK-NEXT: lgh %r2, 166(%r15)
+; CHECK-NEXT: aghi %r15, 168
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call i16 asm sideeffect "", "=f,0"(i16 %cc_dep1)
+ ret i16 %0
+}
+
define signext i32 @int_and_f(i32 signext %cc_dep1) {
; CHECK-LABEL: int_and_f:
; CHECK: # %bb.0: # %entry
@@ -49,6 +73,28 @@ entry:
ret void
}
+define half @half_and_r(half %cc_dep1) {
+; CHECK-LABEL: half_and_r:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: risblg %r0, %r0, 0, 159, 32
+; CHECK-NEXT: srl %r0, 16
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: sll %r0, 16
+; CHECK-NEXT: risbhg %r0, %r0, 0, 159, 32
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: aghi %r15, 168
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call half asm sideeffect "", "=r,0"(half %cc_dep1)
+ ret half %0
+}
+
define float @float_and_r(float %cc_dep1) {
; CHECK-LABEL: float_and_r:
; CHECK: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
index 23d78a9315b404e..b23b40e0f0e907b 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
@@ -4,6 +4,20 @@
;
; Test inline assembly where the operand is bitcasted.
+define signext i16 @short_and_f(i16 signext %cc_dep1) {
+; CHECK-LABEL: short_and_f:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vlvgh %v0, %r2, 0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vlgvh %r0, %v0, 0
+; CHECK-NEXT: lghr %r2, %r0
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call i16 asm sideeffect "", "=f,0"(i16 %cc_dep1)
+ ret i16 %0
+}
+
define signext i32 @int_and_f(i32 signext %cc_dep1) {
; CHECK-LABEL: int_and_f:
; CHECK: # %bb.0: # %entry
@@ -58,6 +72,20 @@ entry:
ret void
}
+define signext i16 @short_and_v(i16 signext %cc_dep1) {
+; CHECK-LABEL: short_and_v:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vlvgh %v0, %r2, 0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vlgvh %r0, %v0, 0
+; CHECK-NEXT: lghr %r2, %r0
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call i16 asm sideeffect "", "=v,0"(i16 %cc_dep1)
+ ret i16 %0
+}
+
define signext i32 @int_and_v(i32 signext %cc_dep1) {
; CHECK-LABEL: int_and_v:
; CHECK: # %bb.0: # %entry
@@ -100,6 +128,19 @@ entry:
ret void
}
+define half @half_and_r(half %cc_dep1) {
+; CHECK-LABEL: half_and_r:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vlgvh %r0, %v0, 0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vlvgh %v0, %r0, 0
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call half asm sideeffect "", "=r,0"(half %cc_dep1)
+ ret half %0
+}
+
define float @float_and_r(float %cc_dep1) {
; CHECK-LABEL: float_and_r:
; CHECK: # %bb.0: # %entry
@@ -143,6 +184,17 @@ entry:
ret void
}
+define half @half_and_v(half %cc_dep1) {
+; CHECK-LABEL: half_and_v:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: br %r14
+entry:
+ %0 = tail call half asm sideeffect "", "=v,0"(half %cc_dep1)
+ ret half %0
+}
+
define float @float_and_v(float %cc_dep1) {
; CHECK-LABEL: float_and_v:
; CHECK: # %bb.0: # %entry
>From b1b95b49ab7bcda8dc8e0c108315b3d2a54001f2 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 2 Dec 2024 13:37:44 -0600
Subject: [PATCH 3/3] Use 4-byte spill size in case of no vector support.
---
llvm/lib/Target/SystemZ/SystemZFeatures.td | 2 +
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 4 ++
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 59 +++-------------
.../Target/SystemZ/SystemZRegisterInfo.cpp | 5 +-
llvm/lib/Target/SystemZ/SystemZRegisterInfo.h | 2 +-
.../lib/Target/SystemZ/SystemZRegisterInfo.td | 11 ++-
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 4 +-
.../lib/Target/SystemZ/SystemZScheduleZ196.td | 4 +-
.../Target/SystemZ/SystemZScheduleZEC12.td | 4 +-
llvm/test/CodeGen/SystemZ/spill-half-01.mir | 68 ++++++++++++-------
llvm/test/CodeGen/SystemZ/spill-half-02.mir | 19 +-----
14 files changed, 85 insertions(+), 109 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZFeatures.td b/llvm/lib/Target/SystemZ/SystemZFeatures.td
index e6b95d32c29fad5..e86c332a8100719 100644
--- a/llvm/lib/Target/SystemZ/SystemZFeatures.td
+++ b/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -196,6 +196,8 @@ def FeatureVector : SystemZFeature<
>;
def FeatureNoVector : SystemZMissingFeature<"Vector">;
+def NoVecHwMode : HwMode<"-vector", [FeatureNoVector]>;
+
def Arch11NewFeatures : SystemZFeatureList<[
FeatureLoadAndZeroRightmostByte,
FeatureLoadStoreOnCond2,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 36e975a233b395a..3c819db82d3cb3d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -134,6 +134,8 @@ defm LoadStoreF128 : MVCLoadStore<load, f128, MVCImm, 15>;
//===----------------------------------------------------------------------===//
let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
+ let isCodeGenOnly = 1 in // Reload f16 from 4-byte spill slot.
+ defm LE16 : UnaryRXPair<"le", 0x78, 0xED64, z_load, FP16, 4>;
defm LE : UnaryRXPair<"le", 0x78, 0xED64, z_load, FP32, 4>;
defm LD : UnaryRXPair<"ld", 0x68, 0xED65, z_load, FP64, 8>;
@@ -154,6 +156,8 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
//===----------------------------------------------------------------------===//
let SimpleBDXStore = 1, mayStore = 1 in {
+ let isCodeGenOnly = 1 in // Spill f16 to 4-byte spill slot.
+ defm STE16 : StoreRXPair<"ste", 0x70, 0xED66, store, FP16, 4>;
defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32, 4>;
defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64, 8>;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index b0c0d76faa4c463..150c124810f4b3a 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -61,7 +61,8 @@ void SystemZInstrInfo::anchor() {}
SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti)
: SystemZGenInstrInfo(-1, -1),
- RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister()),
+ RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(),
+ sti.getHwMode()),
STI(sti) {}
// MI is a 128-bit load or store. Split it into two 64-bit loads or stores,
@@ -996,31 +997,8 @@ void SystemZInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIdx, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI, Register VReg) const {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
- // Without vector support, there are no fp16 load/store instructions, so
- // need to save/restore via GPR.
- if (RC == &SystemZ::FP16BitRegClass && !STI.hasVector()) {
- assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
- "Expected non-SSA form with virtual registers.");
- Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
- Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
- BuildMI(MBB, MBBI, DL, get(SystemZ::COPY))
- .addReg(FP64Reg, RegState::DefineNoRead, SystemZ::subreg_h16)
- .addReg(SrcReg, getKillRegState(isKill));
- BuildMI(MBB, MBBI, DL, get(SystemZ::LGDR), GR64Reg)
- .addReg(FP64Reg, RegState::Kill);
- BuildMI(MBB, MBBI, DL, get(SystemZ::SRLG), GR64Reg)
- .addReg(GR64Reg)
- .addReg(0)
- .addImm(48);
- addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::STH))
- .addReg(GR64Reg, RegState::Kill, SystemZ::subreg_l32),
- FrameIdx);
- return;
- }
-
// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
@@ -1036,31 +1014,8 @@ void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
Register VReg) const {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
- // Without vector support, there are no fp16 load/store instructions, so
- // need to save/restore via GPR.
- if (RC == &SystemZ::FP16BitRegClass && !STI.hasVector()) {
- assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
- "Expected non-SSA form with virtual registers.");
- Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
- Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
- addFrameReference(
- BuildMI(MBB, MBBI, DL, get(SystemZ::LH))
- .addReg(GR64Reg, RegState::DefineNoRead, SystemZ::subreg_l32),
- FrameIdx);
- BuildMI(MBB, MBBI, DL, get(SystemZ::SLLG), GR64Reg)
- .addReg(GR64Reg)
- .addReg(0)
- .addImm(48);
- BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), FP64Reg)
- .addReg(GR64Reg, RegState::Kill);
- BuildMI(MBB, MBBI, DL, get(SystemZ::COPY), DestReg)
- .addReg(FP64Reg, RegState::Kill, SystemZ::subreg_h16);
- return;
- }
-
// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
@@ -1281,9 +1236,10 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
return nullptr;
unsigned OpNum = Ops[0];
- assert(Size * 8 ==
- TRI->getRegSizeInBits(*MF.getRegInfo()
- .getRegClass(MI.getOperand(OpNum).getReg())) &&
+ const TargetRegisterClass *RC =
+ MF.getRegInfo().getRegClass(MI.getOperand(OpNum).getReg());
+ assert((Size * 8 == TRI->getRegSizeInBits(*RC) ||
+ (RC == &SystemZ::FP16BitRegClass && Size == 4 && !STI.hasVector())) &&
"Invalid size combination");
if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 &&
@@ -1922,6 +1878,9 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
RC == &SystemZ::ADDR128BitRegClass) {
LoadOpcode = SystemZ::L128;
StoreOpcode = SystemZ::ST128;
+ } else if (RC == &SystemZ::FP16BitRegClass && !STI.hasVector()) {
+ LoadOpcode = SystemZ::LE16;
+ StoreOpcode = SystemZ::STE16;
} else if (RC == &SystemZ::FP32BitRegClass) {
LoadOpcode = SystemZ::LE;
StoreOpcode = SystemZ::STE;
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index d246d3f3c5bd11f..4b884cc6919cc45 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -233,8 +233,9 @@ SystemZELFRegisters::getCallPreservedMask(const MachineFunction &MF,
return CSR_SystemZ_ELF_RegMask;
}
-SystemZRegisterInfo::SystemZRegisterInfo(unsigned int RA)
- : SystemZGenRegisterInfo(RA) {}
+SystemZRegisterInfo::SystemZRegisterInfo(unsigned int RA, unsigned int HwMode)
+ : SystemZGenRegisterInfo(RA, /*DwarfFlavour=*/0, /*EHFlavour=*/0, /*PC=*/0,
+ HwMode) {}
const MCPhysReg *
SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index cbc02c73f1ac70b..e4aa0d9d45a7d20 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -129,7 +129,7 @@ class SystemZELFRegisters : public SystemZCallingConventionRegisters {
struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
public:
- SystemZRegisterInfo(unsigned int RA);
+ SystemZRegisterInfo(unsigned int RA, unsigned int HwMode);
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is currently only used by LOAD_STACK_GUARD, which requires a non-%r0
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index 1dfe264b501b1c4..7da4c6cd78db9a7 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -35,7 +35,9 @@ def subreg_ll32 : ComposedSubRegIndex<subreg_l64, subreg_l32>;
// If the user provides an alternate order list of regs, it will be used for
// XPLINK. Otherwise, by default, XPLINK will use the regList ordering as well
multiclass SystemZRegClass<string name, list<ValueType> types, int size,
- dag regList, list<dag> altRegList = [regList], bit allocatable = 1> {
+ dag regList, list<dag> altRegList = [regList],
+ bit allocatable = 1,
+ RegInfoByHwMode RI = RegInfoByHwMode<[],[]>> {
def AsmOperand : AsmOperandClass {
let Name = name;
let ParserMethod = "parse"#name;
@@ -49,6 +51,7 @@ multiclass SystemZRegClass<string name, list<ValueType> types, int size,
const SystemZSubtarget &S = MF.getSubtarget<SystemZSubtarget>();
return S.isTargetXPLINK64();
}];
+ let RegInfos = RI;
}
def "" : RegisterOperand<!cast<RegisterClass>(name#"Bit")> {
let ParserMatchClass = !cast<AsmOperandClass>(name#"AsmOperand");
@@ -250,7 +253,11 @@ foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
// There's no store-multiple instruction for FPRs, so we're not fussy
// about the order in which call-saved registers are allocated.
-defm FP16 : SystemZRegClass<"FP16", [f16], 16, (sequence "F%uH", 0, 15)>;
+// Adjust the spill size of f16 to 32 bits in case of no vector support.
+def FP16RI : RegInfoByHwMode<[DefaultMode, NoVecHwMode],
+ [RegInfo<16,16,16>, RegInfo<16,32,32>]>;
+defm FP16 : SystemZRegClass<"FP16", [f16], 16, (sequence "F%uH", 0, 15),
+ [(sequence "F%uH", 0, 15)], 1, FP16RI>;
defm FP32 : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>;
defm FP64 : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>;
defm FP128 : SystemZRegClass<"FP128", [f128], 128,
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index c449b2d400777ee..5e94e0ac9e4ff9a 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -793,7 +793,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
@@ -801,7 +801,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index b17c6c6e611b100..847a8f5664d2f5a 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -813,7 +813,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
@@ -821,7 +821,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 3d0bb0ed902265f..d8e02f0b7375814 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -831,7 +831,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
@@ -839,7 +839,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index 51b422f5f533736..7134de0e8815897 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -832,7 +832,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
@@ -840,7 +840,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index e93f329fb286f49..4ce38d79a797e78 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -725,14 +725,14 @@ def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s)(d|s)$")>;
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E|D)(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E16|E|D)(Y|E32)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>;
def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 95dfab6c476bf32..d3ae3914154a0c0 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -763,14 +763,14 @@ def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s)(d|s)$")>;
// FP: Load instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E|D)(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E16|E|D)(Y|E32)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>;
def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STX$")>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/SystemZ/spill-half-01.mir b/llvm/test/CodeGen/SystemZ/spill-half-01.mir
index 56f4ecbffd2c639..bc6947fdec8a631 100644
--- a/llvm/test/CodeGen/SystemZ/spill-half-01.mir
+++ b/llvm/test/CodeGen/SystemZ/spill-half-01.mir
@@ -3,45 +3,61 @@
# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
# RUN: -start-before=greedy | FileCheck %s -check-prefix=VECTOR
-# Test spilling / reloading of an fp16bit virtual register.
+# Test spilling / reloading fp16bit virtual registers.
---
name: fun0
-alignment: 16
tracksRegLiveness: true
-registers:
- - { id: 0, class: fp16bit }
-liveins:
- - { reg: '$f0h', virtual-reg: '%0' }
-frameInfo:
- maxAlignment: 1
-machineFunctionInfo: {}
body: |
bb.0:
- liveins: $f0h
+ liveins: $f0h, $f2h, $f4h
; CHECK-LABEL: fun0:
- ; CHECK-NOT: $f0
- ; CHECK: # kill: def $f0h killed $f0h killed $f0d def $f0d
- ; CHECK-NEXT: lgdr %r0, %f0
- ; CHECK-NEXT: srlg %r0, %r0, 48
- ; CHECK-NEXT: sth %r0, 166(%r15) # 2-byte Folded Spill
+ ; CHECK: aghi %r15, -240
+ ; CHECK: ste %f4, 172(%r15) # 4-byte Folded Spill
+ ; CHECK-NEXT: ste %f2, 164(%r15) # 4-byte Folded Spill
+ ; CHECK-NEXT: ste %f0, 168(%r15) # 4-byte Folded Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
- ; CHECK: lh %r0, 166(%r15) # 2-byte Folded Reload
- ; CHECK-NEXT: sllg %r0, %r0, 48
- ; CHECK-NEXT: ldgr %f0, %r0
- ; CHECK: # kill: def $f0h killed $f0h killed $f0d
- ; CHECK-NOT: $f0
+ ; CHECK-NEXT: le %f0, 164(%r15) # 4-byte Folded Reload
+ ; CHECK: le %f0, 168(%r15) # 4-byte Folded Reload
+ ; CHECK: le %f0, 172(%r15) # 4-byte Folded Reload
; VECTOR-LABEL: fun0:
- ; VECTOR: vsteh %v0, 166(%r15), 0 # 2-byte Folded Spill
- ; VECTOR-NEXT: #APP
- ; VECTOR-NEXT: #NO_APP
- ; VECTOR-NEXT: vlreph %v0, 166(%r15) # 2-byte Folded Reload
-
+ ; VECTOR: aghi %r15, -232
+ ; VECTOR: vsteh %v4, 166(%r15), 0 # 2-byte Folded Spil
+ ; VECTOR-NEXT: vsteh %v2, 162(%r15), 0 # 2-byte Folded Spil
+ ; VECTOR-NEXT: vsteh %v0, 164(%r15), 0 # 2-byte Folded Spil
+ ; VECTOR-NEXT: #APP
+ ; VECTOR-NEXT: #NO_APP
+ ; VECTOR-NEXT: vlreph %v0, 162(%r15) # 2-byte Folded Reload
+ ; VECTOR: vlreph %v0, 164(%r15) # 2-byte Folded Reload
+ ; VECTOR: vlreph %v0, 166(%r15) # 2-byte Folded Reload
+
+ %2:fp16bit = COPY $f4h
+ %1:fp16bit = COPY $f2h
%0:fp16bit = COPY $f0h
- INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0d, 12, implicit-def dead early-clobber $r1d, 12, implicit-def dead early-clobber $r2d, 12, implicit-def dead early-clobber $r3d, 12, implicit-def dead early-clobber $r4d, 12, implicit-def dead early-clobber $r5d, 12, implicit-def dead early-clobber $r6d, 12, implicit-def dead early-clobber $r7d, 12, implicit-def dead early-clobber $r8d, 12, implicit-def dead early-clobber $r9d, 12, implicit-def dead early-clobber $r10d, 12, implicit-def dead early-clobber $r11d, 12, implicit-def dead early-clobber $r12d, 12, implicit-def dead early-clobber $r13d, 12, implicit-def dead early-clobber $r14d, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d
+ INLINEASM &"", 1, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d
+ $f0h = COPY %1
+ CallBRASL &__extendhfsf2, $f0h, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0s
+ %3:fp32bit = COPY $f0s
$f0h = COPY %0
+ CallBRASL &__extendhfsf2, $f0h, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0s
+ %5:fp32bit = COPY $f0s
+ %5:fp32bit = nofpexcept AEBR %5, %3, implicit-def dead $cc, implicit $fpc
+ $f0s = COPY %5
+ CallBRASL &__truncsfhf2, $f0s, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0h
+ %6:fp16bit = COPY $f0h
+ $f0h = COPY %6
+ CallBRASL &__extendhfsf2, $f0h, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0s
+ %7:fp32bit = COPY $f0s
+ $f0h = COPY %2
+ CallBRASL &__extendhfsf2, $f0h, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0s
+ %9:fp32bit = COPY $f0s
+ %9:fp32bit = nofpexcept AEBR %9, %7, implicit-def dead $cc, implicit $fpc
+ $f0s = COPY %9
+ CallBRASL &__truncsfhf2, $f0s, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0h
+ %10:fp16bit = COPY $f0h
+ $f0h = COPY %10
Return implicit $f0h
...
diff --git a/llvm/test/CodeGen/SystemZ/spill-half-02.mir b/llvm/test/CodeGen/SystemZ/spill-half-02.mir
index 4934d0b72811575..9ee2228612f50e6 100644
--- a/llvm/test/CodeGen/SystemZ/spill-half-02.mir
+++ b/llvm/test/CodeGen/SystemZ/spill-half-02.mir
@@ -5,36 +5,23 @@
---
name: fun0
-alignment: 16
tracksRegLiveness: true
-registers:
- - { id: 0, class: addr64bit }
- - { id: 1, class: addr64bit }
- - { id: 2, class: vr16bit }
-liveins:
- - { reg: '$r2d', virtual-reg: '%0' }
- - { reg: '$r3d', virtual-reg: '%1' }
-frameInfo:
- maxAlignment: 1
-machineFunctionInfo: {}
body: |
bb.0:
liveins: $r2d, $r3d
; CHECK-LABEL: fun0:
- ; CHECK: stg %r3, 168(%r15) # 8-byte Folded Spill
- ; CHECK-NEXT: vlreph %v0, 0(%r2)
+ ; CHECK: vlreph %v0, 0(%r2)
; CHECK-NEXT: vsteh %v0, 166(%r15), 0 # 2-byte Folded Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
- ; CHECK-NEXT: lg %r1, 168(%r15) # 8-byte Folded Reload
; CHECK-NEXT: vlreph %v0, 166(%r15) # 2-byte Folded Reload
- ; CHECK-NEXT: vsteh %v0, 0(%r1), 0
+ ; CHECK-NEXT: vsteh %v0, 0(%r3), 0
%1:addr64bit = COPY $r3d
%0:addr64bit = COPY $r2d
%2:vr16bit = VL16 %0, 0, $noreg
- INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0d, 12, implicit-def dead early-clobber $r1d, 12, implicit-def dead early-clobber $r2d, 12, implicit-def dead early-clobber $r3d, 12, implicit-def dead early-clobber $r4d, 12, implicit-def dead early-clobber $r5d, 12, implicit-def dead early-clobber $r6d, 12, implicit-def dead early-clobber $r7d, 12, implicit-def dead early-clobber $r8d, 12, implicit-def dead early-clobber $r9d, 12, implicit-def dead early-clobber $r10d, 12, implicit-def dead early-clobber $r11d, 12, implicit-def dead early-clobber $r12d, 12, implicit-def dead early-clobber $r13d, 12, implicit-def dead early-clobber $r14d, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d, 12, implicit-def dead early-clobber $f16d, 12, implicit-def dead early-clobber $f17d, 12, implicit-def dead early-clobber $f18d, 12, implicit-def dead early-clobber $f19d, 12, implicit-def dead early-clobber $f20d, 12, implicit-def dead early-clobber $f21d, 12, implicit-def dead early-clobber $f22d, 12, implicit-def dead early-clobber $f23d, 12, implicit-def dead early-clobber $f24d, 12, implicit-def dead early-clobber $f25d, 12, implicit-def dead early-clobber $f26d, 12, implicit-def dead early-clobber $f27d, 12, implicit-def dead early-clobber $f28d, 12, implicit-def dead early-clobber $f29d, 12, implicit-def dead early-clobber $f30d, 12, implicit-def dead early-clobber $f31d
+ INLINEASM &"", 1, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d, 12, implicit-def dead early-clobber $f16d, 12, implicit-def dead early-clobber $f17d, 12, implicit-def dead early-clobber $f18d, 12, implicit-def dead early-clobber $f19d, 12, implicit-def dead early-clobber $f20d, 12, implicit-def dead early-clobber $f21d, 12, implicit-def dead early-clobber $f22d, 12, implicit-def dead early-clobber $f23d, 12, implicit-def dead early-clobber $f24d, 12, implicit-def dead early-clobber $f25d, 12, implicit-def dead early-clobber $f26d, 12, implicit-def dead early-clobber $f27d, 12, implicit-def dead early-clobber $f28d, 12, implicit-def dead early-clobber $f29d, 12, implicit-def dead early-clobber $f30d, 12, implicit-def dead early-clobber $f31d
VST16 %2, %1, 0, $noreg
Return
...
More information about the cfe-commits
mailing list