[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)
Jonas Paulsson via cfe-commits
cfe-commits@lists.llvm.org
Tue Nov 19 17:51:41 PST 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/109164
From d693345f70b4c518190e8795341517da87d4879a Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1@linux.ibm.com>
Date: Tue, 17 Sep 2024 19:34:34 +0200
Subject: [PATCH 1/6] Initial experiments (with integer regs for fp16).
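This first step takes the simplest route: f16 values travel in the low
16 bits of the f32 argument registers via integer bitcasts, and all f16
arithmetic is expanded to libcalls. A hedged user-level sketch of what
this enables on s390x (file and function names here are illustrative,
not part of the patch):

  /* clang --target=s390x-linux-gnu -O2 -c half.c
     Each operand is widened with __gnu_h2f_ieee, the add is done in
     float, and the result is narrowed again with __gnu_f2h_ieee. */
  _Float16 add_halves(_Float16 a, _Float16 b) {
    return a + b;
  }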
---
clang/lib/Basic/Targets/SystemZ.h | 12 ++
clang/lib/CodeGen/Targets/SystemZ.cpp | 12 +-
clang/lib/Sema/SemaExpr.cpp | 2 +-
.../test/CodeGen/SystemZ/fexcess-precision.c | 85 ++++++++
clang/test/CodeGen/SystemZ/systemz-abi.c | 44 ++++
llvm/lib/Target/SystemZ/SystemZCallingConv.td | 4 +-
.../Target/SystemZ/SystemZISelLowering.cpp | 62 +++++-
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 16 +-
llvm/test/CodeGen/SystemZ/fp-half.ll | 201 ++++++++++++++++++
9 files changed, 420 insertions(+), 18 deletions(-)
create mode 100644 clang/test/CodeGen/SystemZ/fexcess-precision.c
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half.ll
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index ef9a07033a6e4f..f665a0b72a8e38 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -91,11 +91,23 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
"-v128:64-a:8:16-n32:64");
}
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128;
+
+ // True if the backend supports operations on the half LLVM IR type.
+ HasLegalHalfType = false;
+ // Allow half arguments and return values.
+ HalfArgsAndReturns = true;
+ // Support _Float16.
+ HasFloat16 = true;
+
HasStrictFP = true;
}
unsigned getMinGlobalAlign(uint64_t Size, bool HasNonWeakDef) const override;
+ bool useFP16ConversionIntrinsics() const override {
+ return false;
+ }
+
void getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const override;
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index 23c96fa5cf98cb..5e313c999240af 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,6 +185,8 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
switch (BT->getKind()) {
+// case BuiltinType::Half: // TODO: should __fp16 (Half) be handled here as well?
+ case BuiltinType::Float16: // _Float16
case BuiltinType::Float:
case BuiltinType::Double:
return true;
@@ -277,7 +279,8 @@ RValue SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
} else {
if (AI.getCoerceToType())
ArgTy = AI.getCoerceToType();
- InFPRs = (!IsSoftFloatABI && (ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
+ InFPRs = (!IsSoftFloatABI &&
+ (ArgTy->isHalfTy() || ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
IsVector = ArgTy->isVectorTy();
UnpaddedSize = TyInfo.Width;
DirectAlign = TyInfo.Align;
@@ -446,10 +449,11 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
// The structure is passed as an unextended integer, a float, or a double.
if (isFPArgumentType(SingleElementTy)) {
- assert(Size == 32 || Size == 64);
+ assert(Size == 16 || Size == 32 || Size == 64);
return ABIArgInfo::getDirect(
- Size == 32 ? llvm::Type::getFloatTy(getVMContext())
- : llvm::Type::getDoubleTy(getVMContext()));
+ Size == 16 ? llvm::Type::getHalfTy(getVMContext())
+ : Size == 32 ? llvm::Type::getFloatTy(getVMContext())
+ : llvm::Type::getDoubleTy(getVMContext()));
} else {
llvm::IntegerType *PassTy = llvm::IntegerType::get(getVMContext(), Size);
return Size <= 32 ? ABIArgInfo::getNoExtend(PassTy)
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index dcf495b700540f..d546a627a7bf68 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -16626,7 +16626,7 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,
PromoteType = QualType();
}
}
- if (TInfo->getType()->isSpecificBuiltinType(BuiltinType::Float))
+ if (TInfo->getType()->isFloat16Type() || TInfo->getType()->isFloat32Type())
PromoteType = Context.DoubleTy;
if (!PromoteType.isNull())
DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,
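The Sema hunk above extends the existing va_arg diagnostic to _Float16,
on the assumption (revisited in patch 3 below) that _Float16, like
float, is promoted to double when passed through "...". A hedged sketch
of the pattern the check concerns; the function is illustrative, not
from the patch:

  #include <stdarg.h>
  /* Under the promotion rule assumed here, _Float16 arguments passed
     through "..." arrive as double, so va_arg must read double;
     va_arg(ap, _Float16) would be diagnosed as undefined behavior. */
  double sum_args(int n, ...) {
    va_list ap;
    double total = 0.0;
    va_start(ap, n);
    for (int i = 0; i < n; ++i)
      total += va_arg(ap, double);
    va_end(ap);
    return total;
  }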
diff --git a/clang/test/CodeGen/SystemZ/fexcess-precision.c b/clang/test/CodeGen/SystemZ/fexcess-precision.c
new file mode 100644
index 00000000000000..4444dbdcc23ca0
--- /dev/null
+++ b/clang/test/CodeGen/SystemZ/fexcess-precision.c
@@ -0,0 +1,85 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=standard -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=STANDARD
+
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=none -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=NONE
+
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=fast -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=FAST
+
+_Float16 f(_Float16 a, _Float16 b, _Float16 c, _Float16 d) {
+ return a * b + c * d;
+}
+
+// STANDARD-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// STANDARD-NEXT: entry:
+// STANDARD-NEXT: %a.addr = alloca half, align 2
+// STANDARD-NEXT: %b.addr = alloca half, align 2
+// STANDARD-NEXT: %c.addr = alloca half, align 2
+// STANDARD-NEXT: %d.addr = alloca half, align 2
+// STANDARD-NEXT: store half %a, ptr %a.addr, align 2
+// STANDARD-NEXT: store half %b, ptr %b.addr, align 2
+// STANDARD-NEXT: store half %c, ptr %c.addr, align 2
+// STANDARD-NEXT: store half %d, ptr %d.addr, align 2
+// STANDARD-NEXT: %0 = load half, ptr %a.addr, align 2
+// STANDARD-NEXT: %ext = fpext half %0 to float
+// STANDARD-NEXT: %1 = load half, ptr %b.addr, align 2
+// STANDARD-NEXT: %ext1 = fpext half %1 to float
+// STANDARD-NEXT: %mul = fmul float %ext, %ext1
+// STANDARD-NEXT: %2 = load half, ptr %c.addr, align 2
+// STANDARD-NEXT: %ext2 = fpext half %2 to float
+// STANDARD-NEXT: %3 = load half, ptr %d.addr, align 2
+// STANDARD-NEXT: %ext3 = fpext half %3 to float
+// STANDARD-NEXT: %mul4 = fmul float %ext2, %ext3
+// STANDARD-NEXT: %add = fadd float %mul, %mul4
+// STANDARD-NEXT: %unpromotion = fptrunc float %add to half
+// STANDARD-NEXT: ret half %unpromotion
+// STANDARD-NEXT: }
+
+// NONE-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// NONE-NEXT: entry:
+// NONE-NEXT: %a.addr = alloca half, align 2
+// NONE-NEXT: %b.addr = alloca half, align 2
+// NONE-NEXT: %c.addr = alloca half, align 2
+// NONE-NEXT: %d.addr = alloca half, align 2
+// NONE-NEXT: store half %a, ptr %a.addr, align 2
+// NONE-NEXT: store half %b, ptr %b.addr, align 2
+// NONE-NEXT: store half %c, ptr %c.addr, align 2
+// NONE-NEXT: store half %d, ptr %d.addr, align 2
+// NONE-NEXT: %0 = load half, ptr %a.addr, align 2
+// NONE-NEXT: %1 = load half, ptr %b.addr, align 2
+// NONE-NEXT: %mul = fmul half %0, %1
+// NONE-NEXT: %2 = load half, ptr %c.addr, align 2
+// NONE-NEXT: %3 = load half, ptr %d.addr, align 2
+// NONE-NEXT: %mul1 = fmul half %2, %3
+// NONE-NEXT: %add = fadd half %mul, %mul1
+// NONE-NEXT: ret half %add
+// NONE-NEXT: }
+
+// FAST-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// FAST-NEXT: entry:
+// FAST-NEXT: %a.addr = alloca half, align 2
+// FAST-NEXT: %b.addr = alloca half, align 2
+// FAST-NEXT: %c.addr = alloca half, align 2
+// FAST-NEXT: %d.addr = alloca half, align 2
+// FAST-NEXT: store half %a, ptr %a.addr, align 2
+// FAST-NEXT: store half %b, ptr %b.addr, align 2
+// FAST-NEXT: store half %c, ptr %c.addr, align 2
+// FAST-NEXT: store half %d, ptr %d.addr, align 2
+// FAST-NEXT: %0 = load half, ptr %a.addr, align 2
+// FAST-NEXT: %ext = fpext half %0 to float
+// FAST-NEXT: %1 = load half, ptr %b.addr, align 2
+// FAST-NEXT: %ext1 = fpext half %1 to float
+// FAST-NEXT: %mul = fmul float %ext, %ext1
+// FAST-NEXT: %2 = load half, ptr %c.addr, align 2
+// FAST-NEXT: %ext2 = fpext half %2 to float
+// FAST-NEXT: %3 = load half, ptr %d.addr, align 2
+// FAST-NEXT: %ext3 = fpext half %3 to float
+// FAST-NEXT: %mul4 = fmul float %ext2, %ext3
+// FAST-NEXT: %add = fadd float %mul, %mul4
+// FAST-NEXT: %unpromotion = fptrunc float %add to half
+// FAST-NEXT: ret half %unpromotion
+// FAST-NEXT: }
diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c
index fd2b5d450cc643..2287126bdeabec 100644
--- a/clang/test/CodeGen/SystemZ/systemz-abi.c
+++ b/clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -45,6 +45,9 @@ long long pass_longlong(long long arg) { return arg; }
__int128 pass_int128(__int128 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0)
+_Float16 pass__Float16(_Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}})
+
float pass_float(float arg) { return arg; }
// CHECK-LABEL: define{{.*}} float @pass_float(float %{{.*}})
@@ -72,6 +75,9 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; }
_Complex long long pass_complex_longlong(_Complex long long arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg)
+_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg)
+
_Complex float pass_complex_float(_Complex float arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr %{{.*}}arg)
@@ -123,6 +129,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; }
// Float-like aggregate types
+struct agg__Float16 { _Float16 a; };
+struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
+
struct agg_float { float a; };
struct agg_float pass_agg_float(struct agg_float arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, float %{{.*}})
@@ -137,6 +148,11 @@ struct agg_longdouble { long double a; };
struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}})
+struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); };
+struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, i64 %{{.*}})
+
struct agg_float_a8 { float a __attribute__((aligned (8))); };
struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float_a8(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a8) align 8 %{{.*}}, double %{{.*}})
@@ -164,6 +180,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; }
// Union types likewise are *not* float-like aggregate types
+union union__Float16 { _Float16 a; };
+union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
+
union union_float { float a; };
union union_float pass_union_float(union union_float arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_union_float(ptr dead_on_unwind noalias writable sret(%union.union_float) align 4 %{{.*}}, i32 noext %{{.*}})
@@ -441,6 +461,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l,
// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
// CHECK: ret void
+struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); }
+// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}}
+// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
+// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0
+// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]]
+// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4
+// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5
+// CHECK: br i1 [[FITS_IN_REGS]],
+// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8
+// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128
+// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22
+// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3
+// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]]
+// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]]
+// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1
+// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]]
+// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2
+// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6
+// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8
+// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
+// CHECK: ret void
+
struct agg_float va_agg_float(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float); }
// CHECK-LABEL: define{{.*}} void @va_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, ptr %{{.*}}
// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
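Worth noting in the va_agg__Float16 checks above: a half-sized value is
read right-justified from its 8-byte slot, so the memory path computes
overflow_arg_area + 6 and then advances the pointer by 8. A hedged C
model of just that memory path (names are illustrative):

  /* Big-endian: the 2-byte payload occupies the last two bytes of an
     8-byte, 8-aligned overflow slot. */
  typedef struct { unsigned short a; } agg_f16;
  agg_f16 next_overflow_arg(char **overflow_arg_area) {
    agg_f16 v;
    __builtin_memcpy(&v, *overflow_arg_area + 6, sizeof v);
    *overflow_arg_area += 8;
    return v;
  }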
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 99bb697ce20142..4f736e1c171874 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,6 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
// other floating-point argument registers available for code that
// doesn't care about the ABI. All floating-point argument registers
// are call-clobbered, so we can use all of them here.
+ CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
@@ -115,6 +116,7 @@ def CC_SystemZ_ELF : CallingConv<[
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
// The first 4 float and double arguments are passed in even registers F0-F6.
+ CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
@@ -138,7 +140,7 @@ def CC_SystemZ_ELF : CallingConv<[
CCAssignToStack<16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+ CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>
]>;
//===----------------------------------------------------------------------===//
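Net effect of the f16 rules above: the first four half arguments are
passed in F0S/F2S/F4S/F6S just like float, and any further ones go to
8-byte-aligned 8-byte stack slots (right-justified). A signature-level
sketch, exercised by the bar/fun5 tests further down:

  /* Arg0..Arg3 arrive in F0/F2/F4/F6; Arg4 arrives in the last two
     bytes of an 8-byte stack slot. */
  _Float16 bar(_Float16 Arg0, _Float16 Arg1, _Float16 Arg2,
               _Float16 Arg3, _Float16 Arg4);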
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 78d91299a357dd..505dfa89958a0b 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -711,6 +711,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::f32, Custom);
}
+ // Expand FP16 <=> FP32 conversions to libcalls and handle FP16 loads and
+ // stores in GPRs.
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+
// VASTART and VACOPY need to deal with the SystemZ-specific varargs
// structure, but VAEND is a no-op.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -784,6 +791,20 @@ bool SystemZTargetLowering::useSoftFloat() const {
return Subtarget.hasSoftFloat();
}
+MVT SystemZTargetLowering::getRegisterTypeForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC,
+ EVT VT) const {
+ // 128-bit single-element vector types are passed like other vectors,
+ // not like their element type.
+ if (VT.isVector() && VT.getSizeInBits() == 128 &&
+ VT.getVectorNumElements() == 1)
+ return MVT::v16i8;
+ // Keep f16 so that they can be recognized and handled.
+ if (VT == MVT::f16)
+ return MVT::f16;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &, EVT VT) const {
if (!VT.isVector())
@@ -1602,6 +1623,15 @@ bool SystemZTargetLowering::splitValueIntoRegisterParts(
return true;
}
+ // Convert f16 to f32 (Out-arg).
+ if (PartVT == MVT::f16) {
+ assert(NumParts == 1 && "");
+ SDValue I16Val = DAG.getBitcast(MVT::i16, Val);
+ SDValue I32Val = DAG.getAnyExtOrTrunc(I16Val, DL, MVT::i32);
+ Parts[0] = DAG.getBitcast(MVT::f32, I32Val);
+ return true;
+ }
+
return false;
}
@@ -1617,6 +1647,18 @@ SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
return SDValue();
}
+// F32Val holds a f16 value in f32, return it as an f16 (In-arg). The
+// CopyFromReg was made into an f32 as required as FP32 registers are used
+// for arguments, now convert it to f16.
+static SDValue convertF32ToF16(SDValue F32Val, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ assert(F32Val->getOpcode() == ISD::CopyFromReg &&
+ "Only expecting to handle f16 with CopyFromReg here.");
+ SDValue I32Val = DAG.getBitcast(MVT::i32, F32Val);
+ SDValue I16Val = DAG.getAnyExtOrTrunc(I32Val, DL, MVT::i16);
+ return DAG.getBitcast(MVT::f16, I16Val);
+}
+
SDValue SystemZTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1656,6 +1698,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
NumFixedGPRs += 1;
RC = &SystemZ::GR64BitRegClass;
break;
+ case MVT::f16:
case MVT::f32:
NumFixedFPRs += 1;
RC = &SystemZ::FP32BitRegClass;
@@ -1680,7 +1723,11 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
Register VReg = MRI.createVirtualRegister(RC);
MRI.addLiveIn(VA.getLocReg(), VReg);
- ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
+ // Special handling is needed for f16.
+ MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
+ ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, ArgVT);
+ if (VA.getLocVT() == MVT::f16)
+ ArgValue = convertF32ToF16(ArgValue, DAG, DL);
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
@@ -1700,9 +1747,12 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
// from this parameter. Unpromoted ints and floats are
// passed as right-justified 8-byte values.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+ if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32 ||
+ VA.getLocVT() == MVT::f16) {
+ unsigned SlotOffs = VA.getLocVT() == MVT::f16 ? 6 : 4;
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
- DAG.getIntPtrConstant(4, DL));
+ DAG.getIntPtrConstant(SlotOffs, DL));
+ }
ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
}
@@ -2121,10 +2171,14 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Copy all of the result registers out of their specified physreg.
for (CCValAssign &VA : RetLocs) {
// Copy the value out, gluing the copy to the end of the call sequence.
+ // Special handling is needed for f16.
+ MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
- VA.getLocVT(), Glue);
+ ArgVT, Glue);
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
+ if (VA.getLocVT() == MVT::f16)
+ RetValue = convertF32ToF16(RetValue, DAG, DL);
// Convert the value of the return register into the value that's
// being returned.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 3c06c1fdf2b1bc..ecffa4deea0f5a 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -447,14 +447,7 @@ class SystemZTargetLowering : public TargetLowering {
return TargetLowering::getNumRegisters(Context, VT);
}
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const override {
- // 128-bit single-element vector types are passed like other vectors,
- // not like their element type.
- if (VT.isVector() && VT.getSizeInBits() == 128 &&
- VT.getVectorNumElements() == 1)
- return MVT::v16i8;
- return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
- }
+ EVT VT) const override;
bool isCheapToSpeculateCtlz(Type *) const override { return true; }
bool isCheapToSpeculateCttz(Type *) const override { return true; }
bool preferZeroCompareBranch() const override { return true; }
@@ -476,6 +469,13 @@ class SystemZTargetLowering : public TargetLowering {
// LD, and having the full constant in memory enables reg/mem opcodes.
return VT != MVT::f64;
}
+ bool softPromoteHalfType() const override { return true; }
+ bool useFPRegsForHalfType() const override { return true; }
+ bool shouldKeepZExtForFP16Conv() const override {
+ // Keep the zero extension from 16 bits if present (as with incoming
+ // arguments).
+ return true;
+ }
bool hasInlineStackProbe(const MachineFunction &MF) const override;
AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
new file mode 100644
index 00000000000000..51ac42b012388e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
+;
+; Tests for 16-bit floating point (half).
+
+; Incoming half arguments added together and returned.
+define half @fun0(half %Op0, half %Op1) {
+; CHECK-LABEL: fun0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: vlgvf %r0, %v2, 0
+; CHECK-NEXT: llghr %r2, %r0
+; CHECK-NEXT: vlgvf %r13, %v0, 0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: llghr %r2, %r13
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: aebr %f0, %f8
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee@PLT
+; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: vlvgf %v0, %r2, 0
+; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ %Res = fadd half %Op0, %Op1
+ ret half %Res
+}
+
+; The half values are loaded and stored instead.
+define void @fun1(ptr %Op0, ptr %Op1, ptr %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: .cfi_offset %r12, -64
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: llgh %r12, 0(%r2)
+; CHECK-NEXT: llgh %r2, 0(%r3)
+; CHECK-NEXT: lgr %r13, %r4
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: lgr %r2, %r12
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: aebr %f0, %f8
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee@PLT
+; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: sth %r2, 0(%r13)
+; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load half, ptr %Op0, align 2
+ %1 = load half, ptr %Op1, align 2
+ %add = fadd half %0, %1
+ store half %add, ptr %Dst, align 2
+ ret void
+}
+
+; Test a chain of half operations which should have each operation surrounded
+; by conversions to/from fp32 for proper emulation.
+define half @fun2(half %Op0, half %Op1, half %Op2) {
+; CHECK-LABEL: fun2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: .cfi_offset %r12, -64
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: vlgvf %r0, %v2, 0
+; CHECK-NEXT: llghr %r2, %r0
+; CHECK-NEXT: vlgvf %r13, %v4, 0
+; CHECK-NEXT: vlgvf %r12, %v0, 0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: llghr %r2, %r12
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: aebr %f0, %f8
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee@PLT
+; CHECK-NEXT: llghr %r2, %r2
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: llghr %r2, %r13
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: wfasb %f0, %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee@PLT
+; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: vlvgf %v0, %r2, 0
+; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ %A0 = fadd half %Op0, %Op1
+ %Res = fadd half %A0, %Op2
+ ret half %Res
+}
+
+; Store an incoming half argument and return a loaded one.
+define half @fun3(half %Op0, ptr %Dst, ptr %Src) {
+; CHECK-LABEL: fun3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vlgvf %r0, %v0, 0
+; CHECK-NEXT: sth %r0, 0(%r2)
+; CHECK-NEXT: lh %r0, 0(%r3)
+; CHECK-NEXT: vlvgf %v0, %r0, 0
+; CHECK-NEXT: br %r14
+entry:
+ store half %Op0, ptr %Dst
+
+ %Res = load half, ptr %Src
+ ret half %Res
+}
+
+; Call a function with half argument and return values.
+declare half @foo(half)
+define void @fun4(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lh %r0, 0(%r2)
+; CHECK-NEXT: vlvgf %v0, %r0, 0
+; CHECK-NEXT: lgr %r13, %r3
+; CHECK-NEXT: brasl %r14, foo@PLT
+; CHECK-NEXT: vlgvf %r0, %v0, 0
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT: lmg %r13, %r15, 264(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ %arg = load half, ptr %Src
+ %Res = call half @foo(half %arg)
+ store half %Res, ptr %Dst
+ ret void
+}
+
+; Receive stack argument.
+define half @bar(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) {
+; CHECK-LABEL: bar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: vlgvf %r0, %v6, 0
+; CHECK-NEXT: llgh %r13, 334(%r15)
+; CHECK-NEXT: llghr %r2, %r0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: lgr %r2, %r13
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee@PLT
+; CHECK-NEXT: wfasb %f0, %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee@PLT
+; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: vlvgf %v0, %r2, 0
+; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %A0 = fadd half %Arg3, %Arg4
+ ret half %A0
+}
+
+; Pass stack argument.
+define void @fun5() {
+; CHECK-LABEL: fun5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: lzer %f0
+; CHECK-NEXT: ldr %f2, %f0
+; CHECK-NEXT: ldr %f4, %f0
+; CHECK-NEXT: ldr %f6, %f0
+; CHECK-NEXT: mvhi 164(%r15), 0
+; CHECK-NEXT: brasl %r14, bar@PLT
+; CHECK-NEXT: lmg %r14, %r15, 280(%r15)
+; CHECK-NEXT: br %r14
+ call void @bar (half 0.0, half 0.0, half 0.0, half 0.0, half 0.0)
+ ret void
+}
From 3be4d235df48d6d2be3cab591637e69401a3f53b Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1@linux.ibm.com>
Date: Mon, 28 Oct 2024 15:44:42 +0100
Subject: [PATCH 2/6] Experiment with soft-promotion in FP regs (not working).
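The experiment here keeps the value inside the FP/vector register file:
the f16 is inserted into lane 0 of a v8f16, the vector is bitcast to
v4f32, and lane 0 is extracted as the f32 register value (with the
reverse on the way in), instead of round-tripping through a GPR. On
big-endian SystemZ that places the f16 payload in the high 16 bits of
the f32. A hedged C model of that bit placement (not code from the
patch):

  #include <string.h>
  /* Lane 0 of v8f16 and lane 0 of v4f32 share their most significant
     bytes, so the f16 bits become the top 16 bits of the f32. */
  float f16_lane0_as_f32(unsigned short f16_bits) {
    unsigned int word = (unsigned int)f16_bits << 16;
    float f;
    memcpy(&f, &word, sizeof f);
    return f;
  }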
---
llvm/lib/Target/SystemZ/SystemZCallingConv.td | 6 ++---
.../Target/SystemZ/SystemZISelLowering.cpp | 23 ++++++++++++-------
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 5 ----
llvm/test/CodeGen/SystemZ/fp-half.ll | 1 -
4 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 4f736e1c171874..6be1b513d10851 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,8 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
// other floating-point argument registers available for code that
// doesn't care about the ABI. All floating-point argument registers
// are call-clobbered, so we can use all of them here.
- CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
- CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f16, f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// Similarly for vectors, with V24 being the ABI-compliant choice.
@@ -116,8 +115,7 @@ def CC_SystemZ_ELF : CallingConv<[
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
// The first 4 float and double arguments are passed in even registers F0-F6.
- CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
- CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f16, f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 505dfa89958a0b..397d01ff9728c9 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -799,7 +799,7 @@ MVT SystemZTargetLowering::getRegisterTypeForCallingConv(
if (VT.isVector() && VT.getSizeInBits() == 128 &&
VT.getVectorNumElements() == 1)
return MVT::v16i8;
- // Keep f16 so that they can be recognized and handled.
+ // Keep f16 so it can be recognized and handled.
if (VT == MVT::f16)
return MVT::f16;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
@@ -1625,10 +1625,13 @@ bool SystemZTargetLowering::splitValueIntoRegisterParts(
// Convert f16 to f32 (Out-arg).
if (PartVT == MVT::f16) {
- assert(NumParts == 1 && "");
- SDValue I16Val = DAG.getBitcast(MVT::i16, Val);
- SDValue I32Val = DAG.getAnyExtOrTrunc(I16Val, DL, MVT::i32);
- Parts[0] = DAG.getBitcast(MVT::f32, I32Val);
+ assert(NumParts == 1 && "f16 only needs one register.");
+ SDValue F16Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8f16,
+ DAG.getUNDEF(MVT::v8f16), Val,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue F32Vec = DAG.getBitcast(MVT::v4f32, F16Vec);
+ Parts[0] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+ F32Vec, DAG.getVectorIdxConstant(0, DL));
return true;
}
@@ -1654,9 +1657,13 @@ static SDValue convertF32ToF16(SDValue F32Val, SelectionDAG &DAG,
const SDLoc &DL) {
assert(F32Val->getOpcode() == ISD::CopyFromReg &&
"Only expecting to handle f16 with CopyFromReg here.");
- SDValue I32Val = DAG.getBitcast(MVT::i32, F32Val);
- SDValue I16Val = DAG.getAnyExtOrTrunc(I32Val, DL, MVT::i16);
- return DAG.getBitcast(MVT::f16, I16Val);
+
+ SDValue F32Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
+ DAG.getUNDEF(MVT::v4f32), F32Val,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue F16Vec = DAG.getBitcast(MVT::v8f16, F32Vec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16,
+ F16Vec, DAG.getVectorIdxConstant(0, DL));
}
SDValue SystemZTargetLowering::LowerFormalArguments(
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index ecffa4deea0f5a..f67d4a27cc0334 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -471,11 +471,6 @@ class SystemZTargetLowering : public TargetLowering {
}
bool softPromoteHalfType() const override { return true; }
bool useFPRegsForHalfType() const override { return true; }
- bool shouldKeepZExtForFP16Conv() const override {
- // Keep the zero extension from 16 bits if present (as with incoming
- // arguments).
- return true;
- }
bool hasInlineStackProbe(const MachineFunction &MF) const override;
AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
index 51ac42b012388e..6458aae7055e41 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -119,7 +119,6 @@ define half @fun3(half %Op0, ptr %Dst, ptr %Src) {
; CHECK-NEXT: br %r14
entry:
store half %Op0, ptr %Dst
-
%Res = load half, ptr %Src
ret half %Res
}
From d824b768573431cce433a1cfc5363e66f4c73893 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1@linux.ibm.com>
Date: Tue, 29 Oct 2024 10:57:54 +0100
Subject: [PATCH 3/6] Try to make f16 legal instead
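Here f16 instead becomes a legal type with its own FP16 register class,
where each F<n>H register is the high half of the corresponding F<n>S,
so argument passing and register moves need no conversion at all. Loads,
stores, FP_ROUND and FP_EXTEND are custom-lowered; everything else is
promoted to float. A hedged C model of the custom load/store path (the
actual lowering is lowerLoadF16/lowerStoreF16 below):

  #include <string.h>
  /* The f16 payload lives in the top 16 bits of the f32 register it
     aliases (subreg_h16). */
  float load_f16(const unsigned short *src) {
    unsigned int bits = (unsigned int)*src << 16;  /* LH/LLGH + SLL */
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
  }
  void store_f16(float reg, unsigned short *dst) {
    unsigned int bits;
    memcpy(&bits, &reg, sizeof bits);
    *dst = (unsigned short)(bits >> 16);           /* SRL + STH */
  }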
---
clang/lib/Basic/Targets/SystemZ.h | 5 +
clang/lib/Sema/SemaExpr.cpp | 2 +-
compiler-rt/test/builtins/CMakeLists.txt | 2 +-
llvm/lib/IR/RuntimeLibcalls.cpp | 2 +
.../SystemZ/AsmParser/SystemZAsmParser.cpp | 7 +
.../MCTargetDesc/SystemZMCTargetDesc.cpp | 7 +
.../MCTargetDesc/SystemZMCTargetDesc.h | 1 +
llvm/lib/Target/SystemZ/SystemZCallingConv.td | 6 +-
.../Target/SystemZ/SystemZISelLowering.cpp | 227 +++--
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 16 +-
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 7 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 2 +
.../lib/Target/SystemZ/SystemZRegisterInfo.td | 17 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 4 +-
.../lib/Target/SystemZ/SystemZScheduleZ196.td | 4 +-
.../Target/SystemZ/SystemZScheduleZEC12.td | 4 +-
llvm/test/CodeGen/SystemZ/fp-half.ll | 789 ++++++++++++++----
llvm/test/CodeGen/SystemZ/fp-round-03.ll | 12 +
llvm/test/CodeGen/SystemZ/fp-sincos-01.ll | 18 +
llvm/test/CodeGen/SystemZ/twoaddr-kill.mir | 8 +-
23 files changed, 906 insertions(+), 246 deletions(-)
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index f665a0b72a8e38..e23f9960948fd4 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -93,6 +93,11 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128;
// True if the backend supports operations on the half LLVM IR type.
+ // When this is false, clang by default emits conversions around each
+ // statement, so that the arithmetic itself is done in float. If
+ // -ffloat16-excess-precision=none is given, however, no such conversions
+ // are emitted and the backend instead promotes each half operation to
+ // float individually.
HasLegalHalfType = false;
// Allow half arguments and return values.
HalfArgsAndReturns = true;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index d546a627a7bf68..dcf495b700540f 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -16626,7 +16626,7 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,
PromoteType = QualType();
}
}
- if (TInfo->getType()->isFloat16Type() || TInfo->getType()->isFloat32Type())
+ if (TInfo->getType()->isSpecificBuiltinType(BuiltinType::Float))
PromoteType = Context.DoubleTy;
if (!PromoteType.isNull())
DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,
diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt
index 8fdcec6029a2a1..63f4c94605c907 100644
--- a/compiler-rt/test/builtins/CMakeLists.txt
+++ b/compiler-rt/test/builtins/CMakeLists.txt
@@ -56,7 +56,7 @@ foreach(arch ${BUILTIN_TEST_ARCH})
string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}")
endif()
else()
- if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64" AND COMPILER_RT_HAS_${arch}_FLOAT16)
+ if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64|s390x" AND COMPILER_RT_HAS_${arch}_FLOAT16)
list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_HAS_FLOAT16)
string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}")
endif()
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index e38fce764b6403..496a681f1cbbad 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -247,6 +247,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
if (!TT.isWasm()) {
// These libcalls are only available in compiler-rt, not libgcc.
if (TT.isArch32Bit()) {
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index e4aefc42d860f2..61de6d34c3f381 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -60,6 +60,7 @@ enum RegisterKind {
GRH32Reg,
GR64Reg,
GR128Reg,
+ FP16Reg,
FP32Reg,
FP64Reg,
FP128Reg,
@@ -356,6 +357,7 @@ class SystemZOperand : public MCParsedAsmOperand {
bool isADDR32() const { return isReg(GR32Reg); }
bool isADDR64() const { return isReg(GR64Reg); }
bool isADDR128() const { return false; }
+ bool isFP16() const { return isReg(FP16Reg); }
bool isFP32() const { return isReg(FP32Reg); }
bool isFP64() const { return isReg(FP64Reg); }
bool isFP128() const { return isReg(FP128Reg); }
@@ -534,6 +536,9 @@ class SystemZAsmParser : public MCTargetAsmParser {
ParseStatus parseADDR128(OperandVector &Operands) {
llvm_unreachable("Shouldn't be used as an operand");
}
+ ParseStatus parseFP16(OperandVector &Operands) {
+ return parseRegister(Operands, FP16Reg);
+ }
ParseStatus parseFP32(OperandVector &Operands) {
return parseRegister(Operands, FP32Reg);
}
@@ -829,6 +834,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands,
case GR128Reg:
Group = RegGR;
break;
+ case FP16Reg:
case FP32Reg:
case FP64Reg:
case FP128Reg:
@@ -882,6 +888,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands,
case GRH32Reg: Regs = SystemZMC::GRH32Regs; break;
case GR64Reg: Regs = SystemZMC::GR64Regs; break;
case GR128Reg: Regs = SystemZMC::GR128Regs; break;
+ case FP16Reg: Regs = SystemZMC::FP16Regs; break;
case FP32Reg: Regs = SystemZMC::FP32Regs; break;
case FP64Reg: Regs = SystemZMC::FP64Regs; break;
case FP128Reg: Regs = SystemZMC::FP128Regs; break;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 333221c46ebb8b..7b8ed4b679c01c 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -61,6 +61,13 @@ const unsigned SystemZMC::GR128Regs[16] = {
SystemZ::R12Q, 0, SystemZ::R14Q, 0
};
+const unsigned SystemZMC::FP16Regs[16] = {
+ SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H,
+ SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H,
+ SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H,
+ SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H
+};
+
const unsigned SystemZMC::FP32Regs[16] = {
SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 39c1836a137005..806463707f58e6 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -43,6 +43,7 @@ extern const unsigned GR32Regs[16];
extern const unsigned GRH32Regs[16];
extern const unsigned GR64Regs[16];
extern const unsigned GR128Regs[16];
+extern const unsigned FP16Regs[16];
extern const unsigned FP32Regs[16];
extern const unsigned FP64Regs[16];
extern const unsigned FP128Regs[16];
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 6be1b513d10851..0ad872bcb63a74 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,7 +50,8 @@ def RetCC_SystemZ_ELF : CallingConv<[
// other floating-point argument registers available for code that
// doesn't care about the ABI. All floating-point argument registers
// are call-clobbered, so we can use all of them here.
- CCIfType<[f16, f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>,
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// Similarly for vectors, with V24 being the ABI-compliant choice.
@@ -115,7 +116,8 @@ def CC_SystemZ_ELF : CallingConv<[
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
// The first 4 float and double arguments are passed in even registers F0-F6.
- CCIfType<[f16, f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>,
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 397d01ff9728c9..1055ee95c5d9e2 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -102,6 +102,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
if (!useSoftFloat()) {
+ addRegisterClass(MVT::f16, &SystemZ::FP16BitRegClass);
if (Subtarget.hasVector()) {
addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
@@ -513,11 +514,35 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
}
// Handle floating-point types.
+ // Promote all f16 operations to float, with some exceptions below.
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, MVT::f16, Promote);
+ setOperationAction(ISD::ConstantFP, MVT::f16, Expand);
+ for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ setTruncStoreAction(VT, MVT::f16, Expand);
+ }
+ setOperationAction(ISD::LOAD, MVT::f16, Custom);
+ setOperationAction(ISD::STORE, MVT::f16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
++I) {
MVT VT = MVT::SimpleValueType(I);
if (isTypeLegal(VT)) {
+ // No special instructions for these.
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ if (VT == MVT::f16)
+ continue;
+
// We can use FI for FRINT.
setOperationAction(ISD::FRINT, VT, Legal);
@@ -530,13 +555,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FROUND, VT, Legal);
}
- // No special instructions for these.
- setOperationAction(ISD::FSIN, VT, Expand);
- setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FPOW, VT, Expand);
-
// Special treatment.
setOperationAction(ISD::IS_FPCLASS, VT, Custom);
@@ -711,13 +729,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::f32, Custom);
}
- // Expand FP16 <=> FP32 conversions to libcalls and handle FP16 loads and
- // stores in GPRs.
- setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
-
// VASTART and VACOPY need to deal with the SystemZ-specific varargs
// structure, but VAEND is a no-op.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -773,6 +784,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Default to having -disable-strictnode-mutation on
IsStrictFPEnabled = true;
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+
if (Subtarget.isTargetzOS()) {
struct RTLibCallMapping {
RTLIB::Libcall Code;
@@ -791,20 +805,6 @@ bool SystemZTargetLowering::useSoftFloat() const {
return Subtarget.hasSoftFloat();
}
-MVT SystemZTargetLowering::getRegisterTypeForCallingConv(
- LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const {
- // 128-bit single-element vector types are passed like other vectors,
- // not like their element type.
- if (VT.isVector() && VT.getSizeInBits() == 128 &&
- VT.getVectorNumElements() == 1)
- return MVT::v16i8;
- // Keep f16 so it can be recognized and handled.
- if (VT == MVT::f16)
- return MVT::f16;
- return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
-}
-
EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &, EVT VT) const {
if (!VT.isVector())
@@ -954,6 +954,10 @@ SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) {
bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
+ // TODO: All f16 constants are loaded from the constant pool for now.
+ if (&Imm.getSemantics() == &APFloat::IEEEhalf())
+ return false;
+
// We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
if (Imm.isZero() || Imm.isNegZero())
return true;
@@ -1623,18 +1627,6 @@ bool SystemZTargetLowering::splitValueIntoRegisterParts(
return true;
}
- // Convert f16 to f32 (Out-arg).
- if (PartVT == MVT::f16) {
- assert(NumParts == 1 && "f16 only needs one register.");
- SDValue F16Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8f16,
- DAG.getUNDEF(MVT::v8f16), Val,
- DAG.getVectorIdxConstant(0, DL));
- SDValue F32Vec = DAG.getBitcast(MVT::v4f32, F16Vec);
- Parts[0] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
- F32Vec, DAG.getVectorIdxConstant(0, DL));
- return true;
- }
-
return false;
}
@@ -1650,22 +1642,6 @@ SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
return SDValue();
}
-// F32Val holds a f16 value in f32, return it as an f16 (In-arg). The
-// CopyFromReg was made into an f32 as required as FP32 registers are used
-// for arguments, now convert it to f16.
-static SDValue convertF32ToF16(SDValue F32Val, SelectionDAG &DAG,
- const SDLoc &DL) {
- assert(F32Val->getOpcode() == ISD::CopyFromReg &&
- "Only expecting to handle f16 with CopyFromReg here.");
-
- SDValue F32Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
- DAG.getUNDEF(MVT::v4f32), F32Val,
- DAG.getVectorIdxConstant(0, DL));
- SDValue F16Vec = DAG.getBitcast(MVT::v8f16, F32Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16,
- F16Vec, DAG.getVectorIdxConstant(0, DL));
-}
-
SDValue SystemZTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1706,6 +1682,9 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
RC = &SystemZ::GR64BitRegClass;
break;
case MVT::f16:
+ NumFixedFPRs += 1;
+ RC = &SystemZ::FP16BitRegClass;
+ break;
case MVT::f32:
NumFixedFPRs += 1;
RC = &SystemZ::FP32BitRegClass;
@@ -1730,11 +1709,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
Register VReg = MRI.createVirtualRegister(RC);
MRI.addLiveIn(VA.getLocReg(), VReg);
- // Special handling is needed for f16.
- MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
- ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, ArgVT);
- if (VA.getLocVT() == MVT::f16)
- ArgValue = convertF32ToF16(ArgValue, DAG, DL);
+ ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
@@ -2072,6 +2047,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getLocMemOffset();
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
Offset += 4;
+ else if (VA.getLocVT() == MVT::f16)
+ Offset += 6;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(Offset, DL));
@@ -2178,14 +2155,10 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Copy all of the result registers out of their specified physreg.
for (CCValAssign &VA : RetLocs) {
// Copy the value out, gluing the copy to the end of the call sequence.
- // Special handling is needed for f16.
- MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
- ArgVT, Glue);
+ VA.getLocVT(), Glue);
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
- if (VA.getLocVT() == MVT::f16)
- RetValue = convertF32ToF16(RetValue, DAG, DL);
// Convert the value of the return register into the value that's
// being returned.
@@ -6169,6 +6142,118 @@ static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) {
return Op;
}
+SDValue SystemZTargetLowering::LowerFP_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
+ MVT VT = Op.getSimpleValueType();
+ MVT SVT = In.getSimpleValueType();
+ if (SVT != MVT::f16)
+ return Op;
+
+ SDLoc DL(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+
+ // Need a libcall. TODO: factor out with the similar code in LowerFP_ROUND below.
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = In;
+ Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
+ Args.push_back(Entry);
+ SDValue Callee = DAG.getExternalSymbol(
+ getLibcallName(RTLIB::FPEXT_F16_F32), getPointerTy(DAG.getDataLayout()));
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ CallingConv::C, EVT(MVT::f32).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args));
+ SDValue Res;
+ std::tie(Res, Chain) = LowerCallTo(CLI);
+ if (IsStrict)
+ Res = DAG.getMergeValues({Res, Chain}, DL);
+
+ return DAG.getNode(ISD::FP_EXTEND, DL, VT, Res);
+}
+
+SDValue SystemZTargetLowering::LowerFP_ROUND(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
+ MVT VT = Op.getSimpleValueType();
+ MVT SVT = In.getSimpleValueType();
+ if (VT != MVT::f16)
+ return SDValue(); // XXX?
+
+ SDLoc DL(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+
+ if (SVT != MVT::f32) {
+ SDValue Rnd = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
+ In = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Rnd);
+ }
+
+ // We need a libcall.
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = In;
+ Entry.Ty = EVT(MVT::f32).getTypeForEVT(*DAG.getContext());
+ Args.push_back(Entry);
+ SDValue Callee = DAG.getExternalSymbol(
+ getLibcallName(RTLIB::FPROUND_F32_F16), getPointerTy(DAG.getDataLayout()));
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ CallingConv::C, EVT(MVT::f16).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args));
+ SDValue Res;
+ std::tie(Res, Chain) = LowerCallTo(CLI);
+ if (IsStrict)
+ Res = DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+}
+
+SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT RegVT = Op.getSimpleValueType();
+ if (RegVT != MVT::f16)
+ return SDValue();
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ SDLoc DL(Ld);
+ assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending f16 load");
+ // Load as integer, shift and insert into upper 2 bytes of the FP register.
+ // TODO: Use VLEH if available.
+ SDValue NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getPointerInfo(),
+ MVT::i16, Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i32, NewLd,
+ DAG.getConstant(16, DL, MVT::i32));
+ SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Shft);
+ SDValue F16Val = DAG.getTargetExtractSubreg(SystemZ::subreg_h16,
+ DL, MVT::f16, BCast);
+ return DAG.getMergeValues({F16Val, NewLd.getValue(1)}, DL);
+}
+
+SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
+ SelectionDAG &DAG) const {
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+ SDLoc DL(St);
+ SDValue StoredVal = St->getValue();
+ MVT StoreVT = StoredVal.getSimpleValueType();
+ if (StoreVT != MVT::f16)
+ return SDValue();
+ // Move into a GPR, shift and store the 2 bytes.
+ // TODO: Use VSTEH if available.
+ SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f32);
+ SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL,
+ MVT::f32, SDValue(U32, 0), StoredVal);
+ SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, In32);
+ SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i32, BCast,
+ DAG.getConstant(16, DL, MVT::i32));
+ return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(),
+ MVT::i16, St->getMemOperand());
+}
+
SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -6346,6 +6431,16 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerAddrSpaceCast(Op, DAG);
case ISD::ROTL:
return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
+ case ISD::FP_EXTEND:
+//case ISD::STRICT_FP_EXTEND:
+ return LowerFP_EXTEND(Op, DAG);
+ case ISD::FP_ROUND:
+//case ISD::STRICT_FP_ROUND:
+ return LowerFP_ROUND(Op, DAG);
+ case ISD::LOAD:
+ return lowerLoadF16(Op, DAG);
+ case ISD::STORE:
+ return lowerStoreF16(Op, DAG);
case ISD::IS_FPCLASS:
return lowerIS_FPCLASS(Op, DAG);
case ISD::GET_ROUNDING:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index f67d4a27cc0334..36c5a78dd8027f 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -447,7 +447,14 @@ class SystemZTargetLowering : public TargetLowering {
return TargetLowering::getNumRegisters(Context, VT);
}
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const override;
+ EVT VT) const override {
+ // 128-bit single-element vector types are passed like other vectors,
+ // not like their element type.
+ if (VT.isVector() && VT.getSizeInBits() == 128 &&
+ VT.getVectorNumElements() == 1)
+ return MVT::v16i8;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+ }
bool isCheapToSpeculateCtlz(Type *) const override { return true; }
bool isCheapToSpeculateCttz(Type *) const override { return true; }
bool preferZeroCompareBranch() const override { return true; }
@@ -469,8 +476,6 @@ class SystemZTargetLowering : public TargetLowering {
// LD, and having the full constant in memory enables reg/mem opcodes.
return VT != MVT::f64;
}
- bool softPromoteHalfType() const override { return true; }
- bool useFPRegsForHalfType() const override { return true; }
bool hasInlineStackProbe(const MachineFunction &MF) const override;
AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
@@ -714,6 +719,11 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerLoadF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerStoreF16(SDValue Op, SelectionDAG &DAG) const;
+
SDValue lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index aad04a2b4159cb..aa9d429e271ced 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -47,8 +47,11 @@ def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>;
def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
// For z13 we prefer LDR over LER to avoid partial register dependencies.
-let isCodeGenOnly = 1 in
- def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+let isCodeGenOnly = 1 in {
+ def LER16 : UnaryRR <"ler", 0x38, null_frag, FP16, FP16>;
+ def LDR16 : UnaryRR<"ldr", 0x28, null_frag, FP16, FP16>;
+ def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+}
// Moves between two floating-point registers that also set the condition
// codes. Note that these instructions will turn SNaNs into QNaNs and should
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index d553c72589f599..1c493afbe620eb 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -968,6 +968,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode;
if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LGR;
+ else if (SystemZ::FP16BitRegClass.contains(DestReg, SrcReg))
+ Opcode = STI.hasVector() ? SystemZ::LDR16 : SystemZ::LER16;
else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
// For z13 we prefer LDR over LER to avoid partial register dependencies.
Opcode = STI.hasVector() ? SystemZ::LDR32 : SystemZ::LER;
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index 8f9bb56f2eb3bb..cb5147aca86af6 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -20,6 +20,7 @@ class SystemZRegWithSubregs<string n, list<Register> subregs>
}
let Namespace = "SystemZ" in {
+def subreg_h16 : SubRegIndex<16, 16>;
def subreg_l32 : SubRegIndex<32, 0>; // Also acts as subreg_hl32.
def subreg_h32 : SubRegIndex<32, 32>; // Also acts as subreg_hh32.
def subreg_l64 : SubRegIndex<64, 0>;
@@ -201,9 +202,16 @@ def F27Dwarf : DwarfMapping<81>;
def F29Dwarf : DwarfMapping<82>;
def F31Dwarf : DwarfMapping<83>;
+// Upper 16 bits of one of the floating-point registers
+class FPR16<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+
// Upper 32 bits of one of the floating-point registers
-class FPR32<bits<16> num, string n> : SystemZReg<n> {
+class FPR32<bits<16> num, string n, FPR16 high>
+ : SystemZRegWithSubregs<n, [high]> {
let HWEncoding = num;
+ let SubRegIndices = [subreg_h16];
}
// One of the floating-point registers.
@@ -223,12 +231,14 @@ class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
// Floating-point registers. Registers 16-31 require the vector facility.
foreach I = 0-15 in {
- def F#I#S : FPR32<I, "f"#I>;
+ def F#I#H : FPR16<I, "f"#I>;
+ def F#I#S : FPR32<I, "f"#I, !cast<FPR16>("F"#I#"H")>;
def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
foreach I = 16-31 in {
- def F#I#S : FPR32<I, "v"#I>;
+ def F#I#H : FPR16<I, "v"#I>;
+ def F#I#S : FPR32<I, "v"#I, !cast<FPR16>("F"#I#"H")>;
def F#I#D : FPR64<I, "v"#I, !cast<FPR32>("F"#I#"S")>,
DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
@@ -240,6 +250,7 @@ foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
// There's no store-multiple instruction for FPRs, so we're not fussy
// about the order in which call-saved registers are allocated.
+defm FP16 : SystemZRegClass<"FP16", [f16], 16, (sequence "F%uH", 0, 15)>;
defm FP32 : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>;
defm FP64 : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>;
defm FP128 : SystemZRegClass<"FP128", [f128], 128,
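
The FPR16 / subreg_h16 additions give each FP register a named top-16-bit slice, mirroring how FP32 already names the top 32 bits of FP64. Roughly, as a C++ illustration (invented helper names; bit positions follow the left-aligned SystemZ FP register layout):

#include <cstdint>

// f16 values occupy bits [63:48] of the 8-byte register (F0H inside F0S
// inside F0D); f32 values occupy bits [63:32].
uint16_t readF0H(uint64_t F0D) { return static_cast<uint16_t>(F0D >> 48); }
uint32_t readF0S(uint64_t F0D) { return static_cast<uint32_t>(F0D >> 32); }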
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index d0fec02777875a..c36bd8d1dd30a7 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -777,8 +777,8 @@ def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index a6d89ce9443c5a..ba08bd8077e12d 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -797,8 +797,8 @@ def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 455354e283ad8e..71474b674dad80 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -815,8 +815,8 @@ def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index 92abf0ba4022cc..c324b2f37641bc 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -816,8 +816,8 @@ def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index 99d0d674bbbb2f..2d09d489c8228e 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -709,8 +709,8 @@ def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 5b334da2bac342..5af031ce73baad 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -747,8 +747,8 @@ def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
index 6458aae7055e41..cbc680f90d160f 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -1,107 +1,439 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
;
; Tests for 16-bit floating point (half).
+
; Incoming half arguments added together and returned.
define half @fun0(half %Op0, half %Op1) {
-; CHECK-LABEL: fun0:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset %f8, -168
-; CHECK-NEXT: vlgvf %r0, %v2, 0
-; CHECK-NEXT: llghr %r2, %r0
-; CHECK-NEXT: vlgvf %r13, %v0, 0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: llghr %r2, %r13
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: aebr %f0, %f8
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
-; CHECK-NEXT: br %r14
+; NOVEC-LABEL: fun0:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
entry:
%Res = fadd half %Op0, %Op1
ret half %Res
}
+define half @fun1(half %Op0, half %Op1) {
+; NOVEC-LABEL: fun1:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f0, %f0
+; NOVEC-NEXT: adbr %f0, %f9
+; NOVEC-NEXT: ledbr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: ldr %f8, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: wfadb %f0, %f9, %f0
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E0 = fpext half %Op0 to double
+ %E1 = fpext half %Op1 to double
+ %Add = fadd double %E0, %E1
+ %Res = fptrunc double %Add to half
+ ret half %Res
+}
+
+define half @fun2(half %Op0, half %Op1) {
+; NOVEC-LABEL: fun2:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f11, -184
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: lxebr %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: lxebr %f0, %f0
+; NOVEC-NEXT: axbr %f0, %f9
+; NOVEC-NEXT: lexbr %f0, %f0
+; NOVEC-NEXT: # kill: def $f0s killed $f0s killed $f0q
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: ldr %f8, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: wfaxb %v0, %v1, %v0
+; VECTOR-NEXT: wflrx %f0, %v0, 0, 3
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E0 = fpext half %Op0 to fp128
+ %E1 = fpext half %Op1 to fp128
+ %Add = fadd fp128 %E0, %E1
+ %Res = fptrunc fp128 %Add to half
+ ret half %Res
+}
+
; The half values are loaded and stored instead.
-define void @fun1(ptr %Op0, ptr %Op1, ptr %Dst) {
-; CHECK-LABEL: fun1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
-; CHECK-NEXT: .cfi_offset %r12, -64
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset %f8, -168
-; CHECK-NEXT: llgh %r12, 0(%r2)
-; CHECK-NEXT: llgh %r2, 0(%r3)
-; CHECK-NEXT: lgr %r13, %r4
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: lgr %r2, %r12
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: aebr %f0, %f8
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
-; CHECK-NEXT: sth %r2, 0(%r13)
-; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
-; CHECK-NEXT: br %r14
+define void @fun3(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun3:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
entry:
- %0 = load half, ptr %Op0, align 2
- %1 = load half, ptr %Op1, align 2
- %add = fadd half %0, %1
- store half %add, ptr %Dst, align 2
+ %Op0 = load half, ptr %Src, align 2
+ %Add = fadd half %Op0, %Op0
+ store half %Add, ptr %Dst, align 2
+ ret void
+}
+
+define void @fun4(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun4:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f0, %f0
+; NOVEC-NEXT: adbr %f0, %f0
+; NOVEC-NEXT: ledbr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun4:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: adbr %f0, %f0
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Op0 = load half, ptr %Src, align 2
+ %E0 = fpext half %Op0 to double
+ %Add = fadd double %E0, %E0
+ %Res = fptrunc double %Add to half
+ store half %Res, ptr %Dst, align 2
+ ret void
+}
+
+define void @fun5(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun5:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: lxebr %f0, %f0
+; NOVEC-NEXT: axbr %f0, %f0
+; NOVEC-NEXT: lexbr %f0, %f0
+; NOVEC-NEXT: # kill: def $f0s killed $f0s killed $f0q
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun5:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: wfaxb %v0, %v0, %v0
+; VECTOR-NEXT: wflrx %f0, %v0, 0, 3
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Op0 = load half, ptr %Src, align 2
+ %E0 = fpext half %Op0 to fp128
+ %Add = fadd fp128 %E0, %E0
+ %Res = fptrunc fp128 %Add to half
+ store half %Res, ptr %Dst, align 2
ret void
}
; Test a chain of half operations which should have each operation surrounded
; by conversions to/from fp32 for proper emulation.
-define half @fun2(half %Op0, half %Op1, half %Op2) {
-; CHECK-LABEL: fun2:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
-; CHECK-NEXT: .cfi_offset %r12, -64
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset %f8, -168
-; CHECK-NEXT: vlgvf %r0, %v2, 0
-; CHECK-NEXT: llghr %r2, %r0
-; CHECK-NEXT: vlgvf %r13, %v4, 0
-; CHECK-NEXT: vlgvf %r12, %v0, 0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: llghr %r2, %r12
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: aebr %f0, %f8
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: llghr %r2, %r2
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: llghr %r2, %r13
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: wfasb %f0, %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
-; CHECK-NEXT: br %r14
+define half @fun6(half %Op0, half %Op1, half %Op2) {
+; NOVEC-LABEL: fun6:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: ler %f8, %f4
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun6:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: ldr %f8, %f4
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: wfasb %f0, %f9, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
entry:
%A0 = fadd half %Op0, %Op1
%Res = fadd half %A0, %Op2
@@ -109,14 +441,32 @@ entry:
}
; Store an incoming half argument and return a loaded one.
-define half @fun3(half %Op0, ptr %Dst, ptr %Src) {
-; CHECK-LABEL: fun3:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vlgvf %r0, %v0, 0
-; CHECK-NEXT: sth %r0, 0(%r2)
-; CHECK-NEXT: lh %r0, 0(%r3)
-; CHECK-NEXT: vlvgf %v0, %r0, 0
-; CHECK-NEXT: br %r14
+define half @fun7(half %Op0, ptr %Dst, ptr %Src) {
+; NOVEC-LABEL: fun7:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r2)
+; NOVEC-NEXT: lh %r0, 0(%r3)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun7:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r2)
+; VECTOR-NEXT: lh %r0, 0(%r3)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: br %r14
entry:
store half %Op0, ptr %Dst
%Res = load half, ptr %Src
@@ -125,23 +475,50 @@ entry:
; Call a function with half argument and return values.
declare half @foo(half)
-define void @fun4(ptr %Src, ptr %Dst) {
-; CHECK-LABEL: fun4:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -160
-; CHECK-NEXT: .cfi_def_cfa_offset 320
-; CHECK-NEXT: lh %r0, 0(%r2)
-; CHECK-NEXT: vlvgf %v0, %r0, 0
-; CHECK-NEXT: lgr %r13, %r3
-; CHECK-NEXT: brasl %r14, foo at PLT
-; CHECK-NEXT: vlgvf %r0, %v0, 0
-; CHECK-NEXT: sth %r0, 0(%r13)
-; CHECK-NEXT: lmg %r13, %r15, 264(%r15)
-; CHECK-NEXT: br %r14
+define void @fun8(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun8:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, foo at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun8:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, foo at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
entry:
%arg = load half, ptr %Src
%Res = call half @foo(half %arg)
@@ -150,51 +527,159 @@ entry:
}
; Receive stack argument.
-define half @bar(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) {
-; CHECK-LABEL: bar:
-; CHECK: # %bb.0:
-; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset %f8, -168
-; CHECK-NEXT: vlgvf %r0, %v6, 0
-; CHECK-NEXT: llgh %r13, 334(%r15)
-; CHECK-NEXT: llghr %r2, %r0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: lgr %r2, %r13
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: wfasb %f0, %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
-; CHECK-NEXT: br %r14
+define half @fun9(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) {
+; NOVEC-LABEL: fun9:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: lh %r0, 342(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ler %f8, %f6
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun9:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: lh %r0, 342(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: ldr %f8, %f6
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
%A0 = fadd half %Arg3, %Arg4
ret half %A0
}
; Pass stack argument.
-define void @fun5() {
-; CHECK-LABEL: fun5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: lzer %f0
-; CHECK-NEXT: ldr %f2, %f0
-; CHECK-NEXT: ldr %f4, %f0
-; CHECK-NEXT: ldr %f6, %f0
-; CHECK-NEXT: mvhi 164(%r15), 0
-; CHECK-NEXT: brasl %r14, bar at PLT
-; CHECK-NEXT: lmg %r14, %r15, 280(%r15)
-; CHECK-NEXT: br %r14
- call void @bar (half 0.0, half 0.0, half 0.0, half 0.0, half 0.0)
+define void @fun10(half %Arg0) {
+; NOVEC-LABEL: fun10:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -168
+; NOVEC-NEXT: .cfi_def_cfa_offset 328
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: ler %f2, %f0
+; NOVEC-NEXT: ler %f4, %f0
+; NOVEC-NEXT: ler %f6, %f0
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, fun9 at PLT
+; NOVEC-NEXT: lmg %r14, %r15, 280(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun10:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -168
+; VECTOR-NEXT: .cfi_def_cfa_offset 328
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: ldr %f2, %f0
+; VECTOR-NEXT: ldr %f4, %f0
+; VECTOR-NEXT: ldr %f6, %f0
+; VECTOR-NEXT: sth %r0, 166(%r15)
+; VECTOR-NEXT: brasl %r14, fun9 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 280(%r15)
+; VECTOR-NEXT: br %r14
+ call void @fun9(half %Arg0, half %Arg0, half %Arg0, half %Arg0, half %Arg0)
+ ret void
+}
+
+; Test loading some immediates from the Constant Pool.
+declare void @foo2(half, half, half)
+define void @fun11() {
+; NOVEC-LABEL: fun11:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lhrl %r0, .LCPI11_0
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lhrl %r0, .LCPI11_1
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; NOVEC-NEXT: lhrl %r0, .LCPI11_2
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; NOVEC-NEXT: brasl %r14, foo2 at PLT
+; NOVEC-NEXT: lmg %r14, %r15, 272(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun11:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lhrl %r0, .LCPI11_0
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: lhrl %r0, .LCPI11_1
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v2, %r0, 0
+; VECTOR-NEXT: lhrl %r0, .LCPI11_2
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v4, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: # kill: def $f2h killed $f2h killed $f2s
+; VECTOR-NEXT: # kill: def $f4h killed $f4h killed $f4s
+; VECTOR-NEXT: brasl %r14, foo2 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ call void @foo2(half 0.0, half 1.0, half 0.375)
ret void
}
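
For reference, the IEEE binary16 patterns fun11 expects in the constant pool, written as sign | 5-bit exponent | 10-bit fraction (a quick sanity check; the constant names are invented):

#include <cstdint>

constexpr uint16_t KHalfZero  = 0x0000; // 0.0
constexpr uint16_t KHalfOne   = 0x3C00; // 1.0   = 1.0 * 2^0,  exponent 15 (bias 15)
constexpr uint16_t KHalf0p375 = 0x3600; // 0.375 = 1.5 * 2^-2, exponent 13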
diff --git a/llvm/test/CodeGen/SystemZ/fp-round-03.ll b/llvm/test/CodeGen/SystemZ/fp-round-03.ll
index d35cafc406ad77..837a95705b5f85 100644
--- a/llvm/test/CodeGen/SystemZ/fp-round-03.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-round-03.ll
@@ -2,6 +2,18 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+; Test that an f16 intrinsic can be lowered with promotion to float.
+declare half @llvm.rint.f16(half %f)
+define half @f0(half %f) {
+; CHECK-LABEL: f0:
+; CHECK: brasl %r14, __extendhfsf2 at PLT
+; CHECK: fiebra %f0, 0, %f0, 0
+; CHECK: brasl %r14, __truncsfhf2 at PLT
+; CHECK: br %r14
+ %res = call half @llvm.rint.f16(half %f)
+ ret half %res
+}
+
; Test rint for f32.
declare float @llvm.rint.f32(float %f)
define float @f1(float %f) {
diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
index 4a38d7afba2c9d..36b3ef67c9e75a 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
@@ -3,6 +3,22 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT
; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT
+; Test f16 libcalls.
+define half @f0(half %x) {
+; CHECK-OPT-LABEL: f0:
+; CHECK-OPT-NOT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-OPT: brasl %r14, sinh at PLT
+; CHECK-OPT: brasl %r14, cosh at PLT
+; CHECK-OPT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-OPT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-OPT: aebr %f0, %f8
+; CHECK-OPT: brasl %r14, __truncsfhf2 at PLT
+ %tmp1 = call half @sinh(half %x) readnone
+ %tmp2 = call half @cosh(half %x) readnone
+ %add = fadd half %tmp1, %tmp2
+ ret half %add
+}
+
define float @f1(float %x) {
; CHECK-OPT-LABEL: f1:
; CHECK-OPT: brasl %r14, sincosf at PLT
@@ -70,9 +86,11 @@ define fp128 @f3_errno(fp128 %x) {
ret fp128 %add
}
+declare half @sinh(half)
declare float @sinf(float)
declare double @sin(double)
declare fp128 @sinl(fp128)
+declare half @cosh(half)
declare float @cosf(float)
declare double @cos(double)
declare fp128 @cosl(fp128)
diff --git a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
index 7fc7bd3e347bb5..fb1448497c1904 100644
--- a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
+++ b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
@@ -18,19 +18,19 @@ body: |
; CHECK-NEXT: $r2l = COPY [[COPY]]
; CHECK-NEXT: $r3l = COPY killed [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:grh32bit = COPY killed [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
+ ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 458762 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
; CHECK-NEXT: [[COPY3:%[0-9]+]]:grh32bit = COPY killed [[COPY2]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:grh32bit = COPY [[COPY3]]
- ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 393225 /* reguse:GRH32Bit */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 458763 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 458761 /* reguse:GRH32Bit */, [[COPY3]]
; CHECK-NEXT: $r2l = COPY killed [[COPY4]]
; CHECK-NEXT: Return implicit killed $r2l
%0:gr32bit = COPY killed $r2l
%2:grh32bit = COPY %0
$r2l = COPY %0
$r3l = COPY killed %0
- INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
+ INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 458762 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
%4:grh32bit = COPY killed %1
- INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 393225 /* reguse:GRH32Bit */, %4
+ INLINEASM &"stepb $1, $2", 0 /* attdialect */, 458763 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 458761 /* reguse:GRH32Bit */, %4
$r2l = COPY killed %3
Return implicit killed $r2l
...
>From 158fa24803d44f448592f64a44c0900ee629eb31 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 5 Nov 2024 11:16:30 -0600
Subject: [PATCH 4/6] Atomic loads/stores, spill/reload, tests for __fp16 and
half vectors.
---
clang/lib/Basic/Targets/SystemZ.h | 4 +-
clang/lib/CodeGen/Targets/SystemZ.cpp | 2 +-
.../{fexcess-precision.c => Float16.c} | 0
clang/test/CodeGen/SystemZ/fp16.c | 32 +
clang/test/CodeGen/SystemZ/systemz-abi.c | 43 +
.../Target/SystemZ/SystemZISelLowering.cpp | 60 +-
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 2 +
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 46 +
llvm/test/CodeGen/SystemZ/atomic-load-10.ll | 16 +
llvm/test/CodeGen/SystemZ/atomic-store-10.ll | 17 +
llvm/test/CodeGen/SystemZ/fp-half-vector.ll | 1050 +++++++++++++++++
llvm/test/CodeGen/SystemZ/fp-half.ll | 3 +-
llvm/test/CodeGen/SystemZ/fp-sincos-01.ll | 2 +-
llvm/test/CodeGen/SystemZ/spill-half.mir | 39 +
14 files changed, 1297 insertions(+), 19 deletions(-)
rename clang/test/CodeGen/SystemZ/{fexcess-precision.c => Float16.c} (100%)
create mode 100644 clang/test/CodeGen/SystemZ/fp16.c
create mode 100644 llvm/test/CodeGen/SystemZ/atomic-load-10.ll
create mode 100644 llvm/test/CodeGen/SystemZ/atomic-store-10.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector.ll
create mode 100644 llvm/test/CodeGen/SystemZ/spill-half.mir
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index e23f9960948fd4..ab9b0d0be4998b 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -94,12 +94,12 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
// True if the backend supports operations on the half LLVM IR type.
// By setting this to false, conversions will happen for _Float16 around
- // a statement by default with operations done in float. However, if
+ // a statement by default, with operations done in float. However, if
// -ffloat16-excess-precision=none is given, no conversions will be made
// and instead the backend will promote each half operation to float
// individually.
HasLegalHalfType = false;
- // Allow half arguments and return values.
+ // Allow half arguments and return values (__fp16).
HalfArgsAndReturns = true;
// Support _Float16.
HasFloat16 = true;
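
Concretely, the default handled here keeps a whole statement in float and truncates once at the end (a sketch, assuming -ffloat16-excess-precision=standard; the function is made up):

_Float16 mulAdd(_Float16 A, _Float16 B, _Float16 C) {
  // Evaluated as (float)A * (float)B + (float)C, truncated to _Float16
  // once for the return value; with -ffloat16-excess-precision=none the
  // backend instead promotes each half operation individually.
  return A * B + C;
}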
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index 5e313c999240af..84af39bba18889 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,7 +185,7 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
switch (BT->getKind()) {
-// case BuiltinType::Half: // __fp16 Support __fp16??
+ case BuiltinType::Half: // __fp16
case BuiltinType::Float16: // _Float16
case BuiltinType::Float:
case BuiltinType::Double:
diff --git a/clang/test/CodeGen/SystemZ/fexcess-precision.c b/clang/test/CodeGen/SystemZ/Float16.c
similarity index 100%
rename from clang/test/CodeGen/SystemZ/fexcess-precision.c
rename to clang/test/CodeGen/SystemZ/Float16.c
diff --git a/clang/test/CodeGen/SystemZ/fp16.c b/clang/test/CodeGen/SystemZ/fp16.c
new file mode 100644
index 00000000000000..52683424f6cc1d
--- /dev/null
+++ b/clang/test/CodeGen/SystemZ/fp16.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu -emit-llvm -o - %s \
+// RUN: | FileCheck %s
+
+__fp16 f(__fp16 a, __fp16 b, __fp16 c, __fp16 d) {
+ return a * b + c * d;
+}
+
+// CHECK-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %a.addr = alloca half, align 2
+// CHECK-NEXT: %b.addr = alloca half, align 2
+// CHECK-NEXT: %c.addr = alloca half, align 2
+// CHECK-NEXT: %d.addr = alloca half, align 2
+// CHECK-NEXT: store half %a, ptr %a.addr, align 2
+// CHECK-NEXT: store half %b, ptr %b.addr, align 2
+// CHECK-NEXT: store half %c, ptr %c.addr, align 2
+// CHECK-NEXT: store half %d, ptr %d.addr, align 2
+// CHECK-NEXT: %0 = load half, ptr %a.addr, align 2
+// CHECK-NEXT: %conv = fpext half %0 to float
+// CHECK-NEXT: %1 = load half, ptr %b.addr, align 2
+// CHECK-NEXT: %conv1 = fpext half %1 to float
+// CHECK-NEXT: %mul = fmul float %conv, %conv1
+// CHECK-NEXT: %2 = load half, ptr %c.addr, align 2
+// CHECK-NEXT: %conv2 = fpext half %2 to float
+// CHECK-NEXT: %3 = load half, ptr %d.addr, align 2
+// CHECK-NEXT: %conv3 = fpext half %3 to float
+// CHECK-NEXT: %mul4 = fmul float %conv2, %conv3
+// CHECK-NEXT: %add = fadd float %mul, %mul4
+// CHECK-NEXT: %4 = fptrunc float %add to half
+// CHECK-NEXT: ret half %4
+// CHECK-NEXT: }
+
diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c
index 2287126bdeabec..07c914f1fa0019 100644
--- a/clang/test/CodeGen/SystemZ/systemz-abi.c
+++ b/clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -45,6 +45,9 @@ long long pass_longlong(long long arg) { return arg; }
__int128 pass_int128(__int128 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0)
+__fp16 pass___fp16(__fp16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} half @pass___fp16(half %{{.*}})
+
_Float16 pass__Float16(_Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}})
@@ -75,6 +78,8 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; }
_Complex long long pass_complex_longlong(_Complex long long arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg)
+// _Complex __fp16 is (currently?) not allowed.
+
_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg)
@@ -129,6 +134,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; }
// Float-like aggregate types
+struct agg___fp16 { __fp16 a; };
+struct agg___fp16 pass_agg___fp16(struct agg___fp16 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, half %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, i16 noext %{{.*}})
+
struct agg__Float16 { _Float16 a; };
struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}})
@@ -148,6 +158,11 @@ struct agg_longdouble { long double a; };
struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}})
+struct agg___fp16_a8 { __fp16 a __attribute__((aligned (8))); };
+struct agg___fp16_a8 pass_agg___fp16_a8(struct agg___fp16_a8 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16_a8) align 8 %{{.*}}, double %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16_a8) align 8 %{{.*}}, i64 %{{.*}})
+
struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); };
struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}})
@@ -180,6 +195,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; }
// Union types likewise are *not* float-like aggregate types
+union union___fp16 { __fp16 a; };
+union union___fp16 pass_union___fp16(union union___fp16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_union___fp16(ptr dead_on_unwind noalias writable sret(%union.union___fp16) align 2 %{{.*}}, i16 noext %{{.*}})
+
union union__Float16 { _Float16 a; };
union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
@@ -461,6 +480,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l,
// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
// CHECK: ret void
+struct agg___fp16 va_agg___fp16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg___fp16); }
+// CHECK-LABEL: define{{.*}} void @va_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, ptr %{{.*}}
+// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
+// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0
+// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]]
+// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4
+// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5
+// CHECK: br i1 [[FITS_IN_REGS]],
+// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8
+// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128
+// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22
+// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3
+// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]]
+// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]]
+// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1
+// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]]
+// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2
+// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6
+// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8
+// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
+// CHECK: ret void
+
struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); }
// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}}
// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
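
The va_agg___fp16 checks above encode the usual SystemZ va_arg split; approximately, in C++ with invented names (hard-float case):

struct VaListTag {
  long GprCount, FprCount;
  char *OverflowArgArea, *RegSaveArea;
};

// The first 4 FP-like arguments come from the FPR save area (offset 128,
// 8-byte slots); later ones from the overflow area, where a 2-byte value
// sits right-aligned in its 8-byte slot (hence the +6).
void *vaArgHalfAgg(VaListTag *L) {
  if (L->FprCount < 4)
    return L->RegSaveArea + 128 + 8 * L->FprCount++;
  char *P = L->OverflowArgArea + 6;
  L->OverflowArgArea += 8;
  return P;
}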
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 1055ee95c5d9e2..b0edea6ad8536f 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -523,7 +523,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTruncStoreAction(VT, MVT::f16, Expand);
}
setOperationAction(ISD::LOAD, MVT::f16, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Custom);
setOperationAction(ISD::STORE, MVT::f16, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
@@ -4596,6 +4598,22 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT RegVT = Op.getSimpleValueType();
+ if (RegVT.getSizeInBits() == 128)
+ return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerLoadF16(Op, DAG);
+}
+
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+ if (Node->getMemoryVT().getSizeInBits() == 128)
+ return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerStoreF16(Op, DAG);
+}
+
SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
@@ -6217,15 +6235,25 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
MVT RegVT = Op.getSimpleValueType();
if (RegVT != MVT::f16)
return SDValue();
- LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
- SDLoc DL(Ld);
- assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending f16 load");
+
+ SDLoc DL(Op);
+ SDValue NewLd;
+ if (auto *AtomicLd = dyn_cast<AtomicSDNode>(Op.getNode())) {
+ assert(EVT(RegVT) == AtomicLd->getMemoryVT() && "Unhandled f16 load");
+ NewLd = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, MVT::i16, MVT::i32,
+ AtomicLd->getChain(), AtomicLd->getBasePtr(),
+ AtomicLd->getMemOperand());
+ cast<AtomicSDNode>(NewLd)->setExtensionType(ISD::EXTLOAD);
+ } else {
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ assert(EVT(RegVT) == Ld->getMemoryVT() && "Unhandled f16 load");
+ NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getPointerInfo(),
+ MVT::i16, Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ }
// Load as integer, shift and insert into upper 2 bytes of the FP register.
// TODO: Use VLEH if available.
- SDValue NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
- Ld->getBasePtr(), Ld->getPointerInfo(),
- MVT::i16, Ld->getOriginalAlign(),
- Ld->getMemOperand()->getFlags());
SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i32, NewLd,
DAG.getConstant(16, DL, MVT::i32));
SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Shft);
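For reference, the bit-level effect of this load lowering can be sketched in host C++ (an illustration of the SHL + BITCAST sequence above, not backend code):

#include <cstdint>
#include <cstring>

// The 16 loaded bits are shifted into the high half of an i32, which is
// then reinterpreted as the f32 register image, so the half value ends up
// in the upper 2 bytes of the FP register.
float f16BitsToF32RegImage(uint16_t halfBits) {
  uint32_t shifted = uint32_t(halfBits) << 16; // ISD::SHL by 16
  float image;
  std::memcpy(&image, &shifted, sizeof image); // ISD::BITCAST i32 -> f32
  return image;
}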
@@ -6236,20 +6264,25 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
SelectionDAG &DAG) const {
- StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
- SDLoc DL(St);
- SDValue StoredVal = St->getValue();
+ SDValue StoredVal = Op->getOperand(1);
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT != MVT::f16)
return SDValue();
- // Move into a GPR, shift and store the 2 bytes.
- // TODO: Use VSTEH if available.
+
+ // Move into a GPR, shift and store the 2 bytes. TODO: Use VSTEH if available.
+ SDLoc DL(Op);
SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f32);
SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL,
MVT::f32, SDValue(U32, 0), StoredVal);
SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, In32);
SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i32, BCast,
DAG.getConstant(16, DL, MVT::i32));
+
+ if (auto *AtomicSt = dyn_cast<AtomicSDNode>(Op.getNode()))
+ return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MVT::i16, AtomicSt->getChain(),
+ Shft, AtomicSt->getBasePtr(), AtomicSt->getMemOperand());
+
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(),
MVT::i16, St->getMemOperand());
}
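The store direction is the mirror image; a host C++ sketch of the BITCAST + SRL sequence above (illustrative only):

#include <cstdint>
#include <cstring>

// The f32 register image is reinterpreted as an i32 and its high half is
// shifted down, leaving the 16 half bits in position for the 2-byte
// truncating (or atomic i16) store.
uint16_t f32RegImageToF16Bits(float image) {
  uint32_t bits;
  std::memcpy(&bits, &image, sizeof bits); // ISD::BITCAST f32 -> i32
  return uint16_t(bits >> 16);             // ISD::SRL by 16
}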
@@ -6373,8 +6406,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
+ return lowerATOMIC_STORE(Op, DAG);
case ISD::ATOMIC_LOAD:
- return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerATOMIC_LOAD(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 36c5a78dd8027f..d3f700561c29ac 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -698,6 +698,8 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
unsigned Opcode) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 1c493afbe620eb..65cfc8cc7935bf 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -996,8 +996,31 @@ void SystemZInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIdx, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI, Register VReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  // There are no fp16 load/store instructions, so we need to save/restore
+  // via a GPR (TODO: use VSTEH when vector support is available).
+ if (RC == &SystemZ::FP16BitRegClass) {
+ assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
+ "Expected non-SSA form with virtual registers.");
+ Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::COPY))
+ .addReg(FP64Reg, RegState::DefineNoRead, SystemZ::subreg_h16)
+ .addReg(SrcReg, getKillRegState(isKill));
+ BuildMI(MBB, MBBI, DL, get(SystemZ::LGDR), GR64Reg)
+ .addReg(FP64Reg, RegState::Kill);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::SRLG), GR64Reg)
+ .addReg(GR64Reg)
+ .addReg(0)
+ .addImm(48);
+ addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::STH))
+ .addReg(GR64Reg, RegState::Kill, SystemZ::subreg_l32),
+ FrameIdx);
+ return;
+ }
+
// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
@@ -1013,8 +1036,31 @@ void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
Register VReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  // There are no fp16 load/store instructions, so we need to save/restore
+  // via a GPR (TODO: use VLEH when vector support is available).
+ if (RC == &SystemZ::FP16BitRegClass) {
+ assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
+ "Expected non-SSA form with virtual registers.");
+ Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
+ addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::LH))
+ .addReg(GR64Reg, RegState::DefineNoRead,
+ SystemZ::subreg_l32),
+ FrameIdx);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::SLLG), GR64Reg)
+ .addReg(GR64Reg)
+ .addReg(0)
+ .addImm(48);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), FP64Reg)
+ .addReg(GR64Reg, RegState::Kill);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::COPY), DestReg)
+ .addReg(FP64Reg, RegState::Kill, SystemZ::subreg_h16);
+ return;
+ }
+
// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
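Both spill paths rely on the same register-image layout; a host C++ sketch of the LGDR/SRLG/STH and LH/SLLG/LDGR sequences above (illustrative only):

#include <cstdint>

// In the 64-bit FP register image the half value (subreg_h16) occupies
// bits 63..48, so shifting by 48 moves it to or from the low 2 bytes
// that STH stores and LH reloads.
uint16_t spillHalfBits(uint64_t f64RegImage) {
  return uint16_t(f64RegImage >> 48); // SRLG by 48, then STH
}
uint64_t reloadHalfBits(uint16_t halfBits) {
  return uint64_t(halfBits) << 48;    // LH, then SLLG by 48 and LDGR
}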
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-10.ll b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
new file mode 100644
index 00000000000000..077cae9c543281
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test fp16 atomic loads.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define half @f1(ptr %src) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lh %r0, 0(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: br %r14
+ %val = load atomic half, ptr %src seq_cst, align 2
+ ret half %val
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-10.ll b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
new file mode 100644
index 00000000000000..3fa5ce0a88bae5
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test half atomic stores.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define void @f1(ptr %src, half %val) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r2)
+; CHECK-NEXT: bcr 15, %r0
+; CHECK-NEXT: br %r14
+ store atomic half %val, ptr %src seq_cst, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
new file mode 100644
index 00000000000000..ed405ce82c1cd2
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
@@ -0,0 +1,1050 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+
+; Add the <8 x half> argument to itself and return the result.
+define <8 x half> @fun0(<8 x half> %Op) {
+; NOVEC-LABEL: fun0:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -224
+; NOVEC-NEXT: .cfi_def_cfa_offset 384
+; NOVEC-NEXT: std %f8, 216(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 208(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 200(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 192(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f12, 184(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f13, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f14, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f15, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: .cfi_offset %f12, -200
+; NOVEC-NEXT: .cfi_offset %f13, -208
+; NOVEC-NEXT: .cfi_offset %f14, -216
+; NOVEC-NEXT: .cfi_offset %f15, -224
+; NOVEC-NEXT: lh %r0, 414(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f15, %r0
+; NOVEC-NEXT: lh %r0, 406(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f12, %r0
+; NOVEC-NEXT: lh %r0, 398(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f9, %r0
+; NOVEC-NEXT: lh %r0, 390(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ler %f10, %f6
+; NOVEC-NEXT: ler %f11, %f4
+; NOVEC-NEXT: ler %f13, %f2
+; NOVEC-NEXT: ler %f14, %f0
+; NOVEC-NEXT: lgr %r13, %r2
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f12
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f12, %f0
+; NOVEC-NEXT: ler %f0, %f15
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f15, %f0
+; NOVEC-NEXT: ler %f0, %f14
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f14, %f0
+; NOVEC-NEXT: ler %f0, %f13
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f13, %f0
+; NOVEC-NEXT: ler %f0, %f11
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f11, %f0
+; NOVEC-NEXT: ler %f0, %f10
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r13)
+; NOVEC-NEXT: lgdr %r0, %f11
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r13)
+; NOVEC-NEXT: lgdr %r0, %f13
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r13)
+; NOVEC-NEXT: lgdr %r0, %f14
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lgdr %r0, %f15
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r13)
+; NOVEC-NEXT: lgdr %r0, %f12
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r13)
+; NOVEC-NEXT: lgdr %r0, %f9
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r13)
+; NOVEC-NEXT: lgdr %r0, %f8
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r13)
+; NOVEC-NEXT: ld %f8, 216(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 208(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 200(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 192(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f12, 184(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f13, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f14, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f15, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r13, %r15, 328(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -224
+; VECTOR-NEXT: .cfi_def_cfa_offset 384
+; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: .cfi_offset %f12, -200
+; VECTOR-NEXT: .cfi_offset %f13, -208
+; VECTOR-NEXT: .cfi_offset %f14, -216
+; VECTOR-NEXT: .cfi_offset %f15, -224
+; VECTOR-NEXT: lh %r0, 414(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v14, %r0, 0
+; VECTOR-NEXT: lh %r0, 406(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v12, %r0, 0
+; VECTOR-NEXT: lh %r0, 398(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v9, %r0, 0
+; VECTOR-NEXT: lh %r0, 390(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: ldr %f10, %f6
+; VECTOR-NEXT: ldr %f11, %f4
+; VECTOR-NEXT: ldr %f13, %f2
+; VECTOR-NEXT: lgr %r13, %r2
+; VECTOR-NEXT: ldr %f15, %f0
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f12
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f12, %f0
+; VECTOR-NEXT: ldr %f0, %f14
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f14, %f0
+; VECTOR-NEXT: ldr %f0, %f15
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f15, %f0
+; VECTOR-NEXT: ldr %f0, %f13
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f13, %f0
+; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f11, %f0
+; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 6(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v11, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 4(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v13, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 2(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v15, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v14, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 14(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v12, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 12(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v9, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 10(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v8, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 8(%r13)
+; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r13, %r15, 328(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Res = fadd <8 x half> %Op, %Op
+ ret <8 x half> %Res
+}
+
+; Same, but with partial vector values.
+define <4 x half> @fun1(<4 x half> %Op) {
+; NOVEC-LABEL: fun1:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -192
+; NOVEC-NEXT: .cfi_def_cfa_offset 352
+; NOVEC-NEXT: std %f8, 184(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: ler %f8, %f6
+; NOVEC-NEXT: ler %f9, %f4
+; NOVEC-NEXT: ler %f10, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f11, %f0
+; NOVEC-NEXT: ler %f0, %f10
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f6, %f0
+; NOVEC-NEXT: ler %f0, %f11
+; NOVEC-NEXT: ler %f2, %f10
+; NOVEC-NEXT: ler %f4, %f9
+; NOVEC-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 304(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
+; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: ldr %f8, %f6
+; VECTOR-NEXT: ldr %f9, %f4
+; VECTOR-NEXT: ldr %f10, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f11, %f0
+; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f6, %f0
+; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: ldr %f2, %f10
+; VECTOR-NEXT: ldr %f4, %f9
+; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 304(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Res = fadd <4 x half> %Op, %Op
+ ret <4 x half> %Res
+}
+
+; Test a vector extension.
+define <2 x half> @fun2(<2 x half> %Op) {
+; NOVEC-LABEL: fun2:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: adbr %f9, %f9
+; NOVEC-NEXT: ldebr %f10, %f0
+; NOVEC-NEXT: ledbr %f0, %f9
+; NOVEC-NEXT: adbr %f10, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ledbr %f0, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f2, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: vmrhg %v0, %v0, %v1
+; VECTOR-NEXT: vfadb %v0, %v0, %v0
+; VECTOR-NEXT: vledb %v0, %v0, 0, 0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: # kill: def $f0s killed $f0s killed $v0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: vrepf %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0s killed $f0s killed $v0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f2, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E = fpext <2 x half> %Op to <2 x double>
+ %Add = fadd <2 x double> %E, %E
+ %Res = fptrunc <2 x double> %Add to <2 x half>
+ ret <2 x half> %Res
+}
+
+; Load and store an <8 x half> vector.
+define void @fun3(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun3:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 2(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 4(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: lh %r0, 6(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 8(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: lh %r0, 10(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 12(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: lh %r0, 14(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r3)
+; NOVEC-NEXT: lgdr %r0, %f6
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r3)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r3)
+; NOVEC-NEXT: lgdr %r0, %f4
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r3)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r3)
+; NOVEC-NEXT: lgdr %r0, %f2
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r3)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r3)
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r3)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: lh %r0, 2(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v1, %r0, 0
+; VECTOR-NEXT: lh %r0, 4(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v2, %r0, 0
+; VECTOR-NEXT: lh %r0, 6(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v3, %r0, 0
+; VECTOR-NEXT: lh %r0, 8(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v4, %r0, 0
+; VECTOR-NEXT: lh %r0, 10(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v5, %r0, 0
+; VECTOR-NEXT: lh %r0, 12(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v6, %r0, 0
+; VECTOR-NEXT: lh %r0, 14(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v7, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v7, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 14(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v6, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 12(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v5, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 10(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v4, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 8(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v3, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 6(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v2, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 4(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v1, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 2(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r3)
+; VECTOR-NEXT: br %r14
+entry:
+ %L = load <8 x half>, ptr %Src
+ store <8 x half> %L, ptr %Dst
+ ret void
+}
+
+; Call a function with <8 x half> argument and return values.
+declare <8 x half> @foo(<8 x half>)
+define void @fun4(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun4:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -208
+; NOVEC-NEXT: .cfi_def_cfa_offset 368
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 2(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; NOVEC-NEXT: lh %r0, 4(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; NOVEC-NEXT: lh %r0, 6(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; NOVEC-NEXT: lh %r0, 8(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 10(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 12(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 14(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 190(%r15)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 182(%r15)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: la %r2, 192(%r15)
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, foo at PLT
+; NOVEC-NEXT: lh %r0, 192(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 194(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 196(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: lh %r0, 198(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 200(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: lh %r0, 202(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 204(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: lh %r0, 206(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r13)
+; NOVEC-NEXT: lgdr %r0, %f6
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r13)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r13)
+; NOVEC-NEXT: lgdr %r0, %f4
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r13)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r13)
+; NOVEC-NEXT: lgdr %r0, %f2
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r13)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r13)
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 312(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun4:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -208
+; VECTOR-NEXT: .cfi_def_cfa_offset 368
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: lh %r0, 2(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v2, %r0, 0
+; VECTOR-NEXT: lh %r0, 4(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v4, %r0, 0
+; VECTOR-NEXT: lh %r0, 6(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v6, %r0, 0
+; VECTOR-NEXT: lh %r0, 8(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v1, %r0, 0
+; VECTOR-NEXT: lh %r0, 10(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v3, %r0, 0
+; VECTOR-NEXT: lh %r0, 12(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v5, %r0, 0
+; VECTOR-NEXT: lh %r0, 14(%r2)
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: # kill: def $f2h killed $f2h killed $f2s
+; VECTOR-NEXT: # kill: def $f4h killed $f4h killed $f4s
+; VECTOR-NEXT: # kill: def $f6h killed $f6h killed $f6s
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v7, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v7, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 190(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v5, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 182(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v3, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 174(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v1, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 166(%r15)
+; VECTOR-NEXT: la %r2, 192(%r15)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: brasl %r14, foo at PLT
+; VECTOR-NEXT: lh %r0, 192(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: lh %r0, 194(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v1, %r0, 0
+; VECTOR-NEXT: lh %r0, 196(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v2, %r0, 0
+; VECTOR-NEXT: lh %r0, 198(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v3, %r0, 0
+; VECTOR-NEXT: lh %r0, 200(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v4, %r0, 0
+; VECTOR-NEXT: lh %r0, 202(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v5, %r0, 0
+; VECTOR-NEXT: lh %r0, 204(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v6, %r0, 0
+; VECTOR-NEXT: lh %r0, 206(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v7, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v7, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 14(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v6, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 12(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v5, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 10(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v4, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 8(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v3, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 6(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v2, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 4(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v1, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 2(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 312(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %arg = load <8 x half>, ptr %Src
+ %Res = call <8 x half> @foo(<8 x half> %arg)
+ store <8 x half> %Res, ptr %Dst
+ ret void
+}
+
+; Receive and pass arguments fully on the stack.
+declare void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
+define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) {
+; NOVEC-LABEL: fun5:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -256
+; NOVEC-NEXT: .cfi_def_cfa_offset 416
+; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: lh %r0, 422(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 430(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 438(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 446(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lh %r0, 454(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f8, %r0
+; NOVEC-NEXT: lh %r0, 462(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f9, %r0
+; NOVEC-NEXT: lh %r0, 470(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f10, %r0
+; NOVEC-NEXT: lh %r0, 478(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f11, %r0
+; NOVEC-NEXT: lgdr %r0, %f11
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 222(%r15)
+; NOVEC-NEXT: lgdr %r0, %f10
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 214(%r15)
+; NOVEC-NEXT: lgdr %r0, %f9
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 206(%r15)
+; NOVEC-NEXT: lgdr %r0, %f8
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 198(%r15)
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 190(%r15)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 182(%r15)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, foo2 at PLT
+; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 368(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun5:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -256
+; VECTOR-NEXT: .cfi_def_cfa_offset 416
+; VECTOR-NEXT: std %f8, 248(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 240(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 232(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f11, 224(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: lh %r0, 422(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v1, %r0, 0
+; VECTOR-NEXT: lh %r0, 430(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v3, %r0, 0
+; VECTOR-NEXT: lh %r0, 438(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v5, %r0, 0
+; VECTOR-NEXT: lh %r0, 446(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v7, %r0, 0
+; VECTOR-NEXT: lh %r0, 454(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v8, %r0, 0
+; VECTOR-NEXT: lh %r0, 462(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v9, %r0, 0
+; VECTOR-NEXT: lh %r0, 470(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v10, %r0, 0
+; VECTOR-NEXT: lh %r0, 478(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v11, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v11, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 222(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v10, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 214(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v9, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 206(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v8, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 198(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v7, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 190(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v5, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 182(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v3, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 174(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v1, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 166(%r15)
+; VECTOR-NEXT: brasl %r14, foo2 at PLT
+; VECTOR-NEXT: ld %f8, 248(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 240(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 232(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f11, 224(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 368(%r15)
+; VECTOR-NEXT: br %r14
+ call void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
+ ret void
+}
+
+; Test loading vector constants.
+declare void @foo3(<8 x half>, <8 x half>)
+define void @fun6() {
+; NOVEC-LABEL: fun6:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -256
+; NOVEC-NEXT: .cfi_def_cfa_offset 416
+; NOVEC-NEXT: lhrl %r0, .LCPI6_0
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 254(%r15)
+; NOVEC-NEXT: sth %r0, 246(%r15)
+; NOVEC-NEXT: sth %r0, 238(%r15)
+; NOVEC-NEXT: sth %r0, 230(%r15)
+; NOVEC-NEXT: sth %r0, 222(%r15)
+; NOVEC-NEXT: sth %r0, 214(%r15)
+; NOVEC-NEXT: sth %r0, 206(%r15)
+; NOVEC-NEXT: sth %r0, 198(%r15)
+; NOVEC-NEXT: lhrl %r0, .LCPI6_1
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: ler %f2, %f0
+; NOVEC-NEXT: ler %f4, %f0
+; NOVEC-NEXT: ler %f6, %f0
+; NOVEC-NEXT: sth %r0, 190(%r15)
+; NOVEC-NEXT: sth %r0, 182(%r15)
+; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, foo3 at PLT
+; NOVEC-NEXT: lmg %r14, %r15, 368(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun6:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -256
+; VECTOR-NEXT: .cfi_def_cfa_offset 416
+; VECTOR-NEXT: lhrl %r0, .LCPI6_0
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 254(%r15)
+; VECTOR-NEXT: sth %r0, 246(%r15)
+; VECTOR-NEXT: sth %r0, 238(%r15)
+; VECTOR-NEXT: sth %r0, 230(%r15)
+; VECTOR-NEXT: sth %r0, 222(%r15)
+; VECTOR-NEXT: sth %r0, 214(%r15)
+; VECTOR-NEXT: sth %r0, 206(%r15)
+; VECTOR-NEXT: sth %r0, 198(%r15)
+; VECTOR-NEXT: lhrl %r0, .LCPI6_1
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: ldr %f2, %f0
+; VECTOR-NEXT: ldr %f4, %f0
+; VECTOR-NEXT: ldr %f6, %f0
+; VECTOR-NEXT: sth %r0, 190(%r15)
+; VECTOR-NEXT: sth %r0, 182(%r15)
+; VECTOR-NEXT: sth %r0, 174(%r15)
+; VECTOR-NEXT: sth %r0, 166(%r15)
+; VECTOR-NEXT: brasl %r14, foo3 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 368(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ call void @foo3(<8 x half> <half 0.0, half 0.0, half 0.0, half 0.0,
+ half 0.0, half 0.0, half 0.0, half 0.0>,
+ <8 x half> <half 0.375, half 0.375, half 0.375, half 0.375,
+ half 0.375, half 0.375, half 0.375, half 0.375>)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
index cbc680f90d160f..c704efe2c26eb0 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -6,7 +6,6 @@
;
; Tests for 16-bit floating point (half).
-
; Incoming half arguments added together and returned.
define half @fun0(half %Op0, half %Op1) {
; NOVEC-LABEL: fun0:
@@ -680,6 +679,6 @@ define void @fun11() {
; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
entry:
- call half @foo2(half 0.0, half 1.0, half 0.375)
+ call void @foo2(half 0.0, half 1.0, half 0.375)
ret void
}
diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
index 36b3ef67c9e75a..95114eab364e57 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT
; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT
-; Test f16 libcalls.
+; Test f16 libcall.
define half @f0(half %x) {
; CHECK-OPT-LABEL: f0:
; CHECK-OPT-NOT: brasl %r14, __extendhfsf2 at PLT
diff --git a/llvm/test/CodeGen/SystemZ/spill-half.mir b/llvm/test/CodeGen/SystemZ/spill-half.mir
new file mode 100644
index 00000000000000..07934103da6edc
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/spill-half.mir
@@ -0,0 +1,39 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z10 -verify-machineinstrs \
+# RUN: -start-before=greedy | FileCheck %s
+
+# Test spilling / reloading of an fp16bit virtual register.
+
+---
+name: fun0
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fp16bit }
+liveins:
+ - { reg: '$f0h', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $f0h
+
+ ; CHECK-LABEL: fun0:
+ ; CHECK-NOT: $f0
+ ; CHECK: # kill: def $f0h killed $f0h killed $f0d def $f0d
+ ; CHECK-NEXT: lgdr %r0, %f0
+ ; CHECK-NEXT: srlg %r0, %r0, 48
+ ; CHECK-NEXT: sth %r0, 166(%r15) # 2-byte Folded Spill
+ ; CHECK-NEXT: #APP
+ ; CHECK-NEXT: #NO_APP
+ ; CHECK-NEXT: lh %r0, 166(%r15) # 2-byte Folded Reload
+ ; CHECK-NEXT: sllg %r0, %r0, 48
+ ; CHECK-NEXT: ldgr %f0, %r0
+ ; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+ ; CHECK-NOT: $f0
+
+ %0:fp16bit = COPY $f0h
+ INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0d, 12, implicit-def dead early-clobber $r1d, 12, implicit-def dead early-clobber $r2d, 12, implicit-def dead early-clobber $r3d, 12, implicit-def dead early-clobber $r4d, 12, implicit-def dead early-clobber $r5d, 12, implicit-def dead early-clobber $r6d, 12, implicit-def dead early-clobber $r7d, 12, implicit-def dead early-clobber $r8d, 12, implicit-def dead early-clobber $r9d, 12, implicit-def dead early-clobber $r10d, 12, implicit-def dead early-clobber $r11d, 12, implicit-def dead early-clobber $r12d, 12, implicit-def dead early-clobber $r13d, 12, implicit-def dead early-clobber $r14d, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d
+ $f0h = COPY %0
+ Return implicit $f0h
+...
>From f0f15efd08b90977960eb867fb3a9c6ea7bab07f Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 18 Nov 2024 14:33:48 -0600
Subject: [PATCH 5/6] strict f16 with tests.
---
.../Target/SystemZ/SystemZISelLowering.cpp | 24 +-
llvm/test/CodeGen/SystemZ/atomic-load-10.ll | 2 +-
llvm/test/CodeGen/SystemZ/atomic-store-10.ll | 2 +-
llvm/test/CodeGen/SystemZ/fp-half-strict.ll | 219 ++++++++++++++++++
llvm/test/CodeGen/SystemZ/fp-round-03.ll | 3 +-
llvm/test/CodeGen/SystemZ/fp-sincos-01.ll | 2 +-
6 files changed, 234 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-strict.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index b0edea6ad8536f..c18657dbe71f86 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -521,15 +521,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
setTruncStoreAction(VT, MVT::f16, Expand);
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
}
- setOperationAction(ISD::LOAD, MVT::f16, Custom);
- setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Custom);
- setOperationAction(ISD::STORE, MVT::f16, Custom);
- setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ for (auto Op : {ISD::LOAD, ISD::ATOMIC_LOAD,
+ ISD::STORE, ISD::ATOMIC_STORE,
+ ISD::FP_ROUND, ISD::STRICT_FP_ROUND})
+ setOperationAction(Op, MVT::f16, Custom);
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
@@ -569,7 +567,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
if (Subtarget.hasFPExtension()) {
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
@@ -6167,7 +6164,7 @@ SDValue SystemZTargetLowering::LowerFP_EXTEND(SDValue Op,
MVT VT = Op.getSimpleValueType();
MVT SVT = In.getSimpleValueType();
if (SVT != MVT::f16)
- return Op;
+ return Op; // Legal
SDLoc DL(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
@@ -6199,8 +6196,7 @@ SDValue SystemZTargetLowering::LowerFP_ROUND(SDValue Op,
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT VT = Op.getSimpleValueType();
MVT SVT = In.getSimpleValueType();
- if (VT != MVT::f16)
- return SDValue(); // XXX?
+ assert(VT == MVT::f16 && "Only rounding to f16 needs custom handling.");
SDLoc DL(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
@@ -6466,10 +6462,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::ROTL:
return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
case ISD::FP_EXTEND:
-//case ISD::STRICT_FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND:
return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:
-//case ISD::STRICT_FP_ROUND:
+ case ISD::STRICT_FP_ROUND:
return LowerFP_ROUND(Op, DAG);
case ISD::LOAD:
return lowerLoadF16(Op, DAG);
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-10.ll b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
index 077cae9c543281..84f4014887546f 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Test fp16 atomic loads.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s
define half @f1(ptr %src) {
; CHECK-LABEL: f1:
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-10.ll b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
index 3fa5ce0a88bae5..cee5b0d56209b0 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Test half atomic stores.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s
define void @f1(ptr %src, half %val) {
; CHECK-LABEL: f1:
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-strict.ll b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll
new file mode 100644
index 00000000000000..7d8acfa40eb313
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll
@@ -0,0 +1,219 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+;
+; Tests for strict 16-bit floating point (half).
+
+declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata)
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
+declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata)
+
+; Test register addition.
+define half @fun0(half %f1, half %f2) #0 {
+; NOVEC-LABEL: fun0:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+ %res = call half @llvm.experimental.constrained.fadd.f16(
+ half %f1, half %f2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret half %res
+}
+
+; Test atomic memory accesses and extension/truncation inside a strictfp
+; function.
+define void @fun1(ptr %Src, ptr %Dst) #0 {
+; NOVEC-LABEL: fun1:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f0, %f0
+; NOVEC-NEXT: adbr %f0, %f0
+; NOVEC-NEXT: ledbr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: bcr 14, %r0
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: adbr %f0, %f0
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: bcr 14, %r0
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Op0 = load atomic half, ptr %Src seq_cst, align 2
+ %E0 = fpext half %Op0 to double
+ %Add = call double @llvm.experimental.constrained.fadd.f64(
+ double %E0, double %E0,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %Res = fptrunc double %Add to half
+ store atomic half %Res, ptr %Dst seq_cst, align 2
+ ret void
+}
+
+; Test a chain of half operations; each operation should be surrounded by
+; conversions to/from fp32 for proper emulation.
+define half @fun2(half %Op0, half %Op1, half %Op2) #0 {
+; NOVEC-LABEL: fun2:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: ler %f8, %f4
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: meebr %f0, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: meebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: ldr %f8, %f4
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: meebr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: wfmsb %f0, %f9, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %A0 = call half @llvm.experimental.constrained.fmul.f16(
+ half %Op0, half %Op1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %Res = call half @llvm.experimental.constrained.fmul.f16(
+ half %A0, half %Op2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret half %Res
+}
+
+attributes #0 = { strictfp }
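
As a rough source-level counterpart of fun0 above, assuming the function is
built with strict FP semantics enabled (for example via -ffp-model=strict;
the function name is illustrative, not from the patch):

  // Assumed compiled with clang -ffp-model=strict for s390x: the addition
  // is then emitted as llvm.experimental.constrained.fadd.f16 with
  // "round.dynamic" / "fpexcept.strict", which is what fun0 checks.
  _Float16 strict_add(_Float16 a, _Float16 b) {
    return a + b;
  }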
diff --git a/llvm/test/CodeGen/SystemZ/fp-round-03.ll b/llvm/test/CodeGen/SystemZ/fp-round-03.ll
index 837a95705b5f85..e7a9c0fa6e87aa 100644
--- a/llvm/test/CodeGen/SystemZ/fp-round-03.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-round-03.ll
@@ -1,6 +1,7 @@
; Test rounding functions for z14 and above.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -verify-machineinstrs \
+; RUN: | FileCheck %s
; Test that an f16 intrinsic can be lowered with promotion to float.
declare half @llvm.rint.f16(half %f)
diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
index 95114eab364e57..a004a5f9a7bd12 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
@@ -1,6 +1,6 @@
; Test that combined sin/cos library call is emitted when appropriate
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-OPT
; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT
; Test f16 libcall.
>From a128da76709e989c85df7095561469821913a215 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 19 Nov 2024 13:51:28 -0600
Subject: [PATCH 6/6] Review
---
clang/docs/LanguageExtensions.rst | 1 +
clang/lib/Basic/Targets/SystemZ.h | 2 -
clang/lib/CodeGen/Targets/SystemZ.cpp | 1 -
clang/test/CodeGen/SystemZ/fp16.c | 51 +--
clang/test/CodeGen/SystemZ/systemz-abi.c | 43 ---
llvm/lib/IR/RuntimeLibcalls.cpp | 7 +-
.../Target/SystemZ/SystemZISelLowering.cpp | 104 +-----
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 3 +-
llvm/test/CodeGen/SystemZ/fp-half-libcall.ll | 312 ++++++++++++++++++
llvm/test/CodeGen/SystemZ/fp-half-strict.ll | 12 +-
llvm/test/CodeGen/SystemZ/fp-half-vector.ll | 58 ++--
llvm/test/CodeGen/SystemZ/fp-half.ll | 149 +++++----
llvm/test/CodeGen/SystemZ/fp-sincos-01.ll | 20 +-
13 files changed, 480 insertions(+), 283 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-libcall.ll
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index ff8e841ee53a2b..967d9771e62f6c 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -865,6 +865,7 @@ to ``float``; see below for more information on this emulation.
* SPIR (natively)
* X86 (if SSE2 is available; natively if AVX512-FP16 is also available)
* RISC-V (natively if Zfh or Zhinx is available)
+ * SystemZ (emulated)
* ``__bf16`` is supported on the following targets (currently never natively):
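
To make the "emulated" wording concrete, a minimal sketch (the function
names are illustrative, not part of the patch):

  // On s390x there is no native half arithmetic, so conversions go through
  // the compiler-rt helpers, while _Float16 values themselves are passed
  // and returned in FP registers.
  float widen(_Float16 h) { return h; }             // __extendhfsf2
  _Float16 narrow(float f) { return (_Float16)f; }  // __truncsfhf2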
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index ab9b0d0be4998b..b4da2c9ce64754 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -99,8 +99,6 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
// and instead the backend will promote each half operation to float
// individually.
HasLegalHalfType = false;
- // Allow half arguments and return values (__fp16).
- HalfArgsAndReturns = true;
// Support _Float16.
HasFloat16 = true;
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index 84af39bba18889..021d764dbfd063 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,7 +185,6 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
switch (BT->getKind()) {
- case BuiltinType::Half: // __fp16
case BuiltinType::Float16: // _Float16
case BuiltinType::Float:
case BuiltinType::Double:
diff --git a/clang/test/CodeGen/SystemZ/fp16.c b/clang/test/CodeGen/SystemZ/fp16.c
index 52683424f6cc1d..430958b69a177b 100644
--- a/clang/test/CodeGen/SystemZ/fp16.c
+++ b/clang/test/CodeGen/SystemZ/fp16.c
@@ -1,32 +1,39 @@
// RUN: %clang_cc1 -triple s390x-linux-gnu -emit-llvm -o - %s \
// RUN: | FileCheck %s
-__fp16 f(__fp16 a, __fp16 b, __fp16 c, __fp16 d) {
- return a * b + c * d;
+void f(__fp16 *a, __fp16 *b, __fp16 *c, __fp16 *d, __fp16 *e) {
+ *e = (*a) * (*b) + (*c) * (*d);
}
-// CHECK-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// CHECK-LABEL: define dso_local void @f(ptr noundef %a, ptr noundef %b, ptr noundef %c, ptr noundef %d, ptr noundef %e) #0 {
// CHECK-NEXT: entry:
-// CHECK-NEXT: %a.addr = alloca half, align 2
-// CHECK-NEXT: %b.addr = alloca half, align 2
-// CHECK-NEXT: %c.addr = alloca half, align 2
-// CHECK-NEXT: %d.addr = alloca half, align 2
-// CHECK-NEXT: store half %a, ptr %a.addr, align 2
-// CHECK-NEXT: store half %b, ptr %b.addr, align 2
-// CHECK-NEXT: store half %c, ptr %c.addr, align 2
-// CHECK-NEXT: store half %d, ptr %d.addr, align 2
-// CHECK-NEXT: %0 = load half, ptr %a.addr, align 2
-// CHECK-NEXT: %conv = fpext half %0 to float
-// CHECK-NEXT: %1 = load half, ptr %b.addr, align 2
-// CHECK-NEXT: %conv1 = fpext half %1 to float
+// CHECK-NEXT: %a.addr = alloca ptr, align 8
+// CHECK-NEXT: %b.addr = alloca ptr, align 8
+// CHECK-NEXT: %c.addr = alloca ptr, align 8
+// CHECK-NEXT: %d.addr = alloca ptr, align 8
+// CHECK-NEXT: %e.addr = alloca ptr, align 8
+// CHECK-NEXT: store ptr %a, ptr %a.addr, align 8
+// CHECK-NEXT: store ptr %b, ptr %b.addr, align 8
+// CHECK-NEXT: store ptr %c, ptr %c.addr, align 8
+// CHECK-NEXT: store ptr %d, ptr %d.addr, align 8
+// CHECK-NEXT: store ptr %e, ptr %e.addr, align 8
+// CHECK-NEXT: %0 = load ptr, ptr %a.addr, align 8
+// CHECK-NEXT: %1 = load half, ptr %0, align 2
+// CHECK-NEXT: %conv = fpext half %1 to float
+// CHECK-NEXT: %2 = load ptr, ptr %b.addr, align 8
+// CHECK-NEXT: %3 = load half, ptr %2, align 2
+// CHECK-NEXT: %conv1 = fpext half %3 to float
// CHECK-NEXT: %mul = fmul float %conv, %conv1
-// CHECK-NEXT: %2 = load half, ptr %c.addr, align 2
-// CHECK-NEXT: %conv2 = fpext half %2 to float
-// CHECK-NEXT: %3 = load half, ptr %d.addr, align 2
-// CHECK-NEXT: %conv3 = fpext half %3 to float
+// CHECK-NEXT: %4 = load ptr, ptr %c.addr, align 8
+// CHECK-NEXT: %5 = load half, ptr %4, align 2
+// CHECK-NEXT: %conv2 = fpext half %5 to float
+// CHECK-NEXT: %6 = load ptr, ptr %d.addr, align 8
+// CHECK-NEXT: %7 = load half, ptr %6, align 2
+// CHECK-NEXT: %conv3 = fpext half %7 to float
// CHECK-NEXT: %mul4 = fmul float %conv2, %conv3
// CHECK-NEXT: %add = fadd float %mul, %mul4
-// CHECK-NEXT: %4 = fptrunc float %add to half
-// CHECK-NEXT: ret half %4
+// CHECK-NEXT: %8 = fptrunc float %add to half
+// CHECK-NEXT: %9 = load ptr, ptr %e.addr, align 8
+// CHECK-NEXT: store half %8, ptr %9, align 2
+// CHECK-NEXT: ret void
// CHECK-NEXT: }
-
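
The rewritten test reflects that, with HalfArgsAndReturns cleared, __fp16
reverts to a storage-only type on SystemZ: values travel through memory, and
only _Float16 may be passed or returned by value. A hypothetical caller, for
illustration:

  // Each __fp16 operand is promoted to float for the arithmetic and the
  // result is truncated back to half on store.
  void scale(__fp16 *dst, const __fp16 *src, int n) {
    for (int i = 0; i < n; ++i)
      dst[i] = src[i] + src[i];
  }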
diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c
index 07c914f1fa0019..2287126bdeabec 100644
--- a/clang/test/CodeGen/SystemZ/systemz-abi.c
+++ b/clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -45,9 +45,6 @@ long long pass_longlong(long long arg) { return arg; }
__int128 pass_int128(__int128 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0)
-__fp16 pass___fp16(__fp16 arg) { return arg; }
-// CHECK-LABEL: define{{.*}} half @pass___fp16(half %{{.*}})
-
_Float16 pass__Float16(_Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}})
@@ -78,8 +75,6 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; }
_Complex long long pass_complex_longlong(_Complex long long arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg)
-// _Complex __fp16 is (currently?) not allowed.
-
_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg)
@@ -134,11 +129,6 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; }
// Float-like aggregate types
-struct agg___fp16 { __fp16 a; };
-struct agg___fp16 pass_agg___fp16(struct agg___fp16 arg) { return arg; }
-// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, half %{{.*}})
-// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, i16 noext %{{.*}})
-
struct agg__Float16 { _Float16 a; };
struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}})
@@ -158,11 +148,6 @@ struct agg_longdouble { long double a; };
struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}})
-struct agg___fp16_a8 { __fp16 a __attribute__((aligned (8))); };
-struct agg___fp16_a8 pass_agg___fp16_a8(struct agg___fp16_a8 arg) { return arg; }
-// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16_a8) align 8 %{{.*}}, double %{{.*}})
-// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16_a8) align 8 %{{.*}}, i64 %{{.*}})
-
struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); };
struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}})
@@ -195,10 +180,6 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; }
// Union types likewise are *not* float-like aggregate types
-union union___fp16 { __fp16 a; };
-union union___fp16 pass_union___fp16(union union___fp16 arg) { return arg; }
-// CHECK-LABEL: define{{.*}} void @pass_union___fp16(ptr dead_on_unwind noalias writable sret(%union.union___fp16) align 2 %{{.*}}, i16 noext %{{.*}})
-
union union__Float16 { _Float16 a; };
union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
@@ -480,30 +461,6 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l,
// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
// CHECK: ret void
-struct agg___fp16 va_agg___fp16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg___fp16); }
-// CHECK-LABEL: define{{.*}} void @va_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, ptr %{{.*}}
-// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
-// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0
-// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]]
-// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4
-// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5
-// CHECK: br i1 [[FITS_IN_REGS]],
-// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8
-// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128
-// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22
-// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3
-// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]]
-// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]]
-// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1
-// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]]
-// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2
-// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]]
-// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6
-// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8
-// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]]
-// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
-// CHECK: ret void
-
struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); }
// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}}
// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 496a681f1cbbad..7004da809d9499 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -247,8 +247,6 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
if (!TT.isWasm()) {
// These libcalls are only available in compiler-rt, not libgcc.
if (TT.isArch32Bit()) {
- setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
- setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
@@ -257,4 +255,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
}
setLibcallName(RTLIB::MULO_I128, nullptr);
}
+
+ if (TT.isSystemZ()) {
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+ }
}
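
For reference, the semantics of the two helpers being registered here
(signatures simplified; compiler-rt spells the parameter and return types
via typedefs that depend on whether a native half type is available):

  // __extendhfsf2 widens IEEE binary16 to binary32; every half value is
  // exactly representable in float, so the conversion never rounds.
  extern "C" float __extendhfsf2(_Float16 a);
  // __truncsfhf2 narrows binary32 to binary16, rounding to nearest even.
  extern "C" _Float16 __truncsfhf2(float a);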
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c18657dbe71f86..c7500d5691f61c 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -524,25 +524,16 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_EXTEND, VT, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
}
- for (auto Op : {ISD::LOAD, ISD::ATOMIC_LOAD,
- ISD::STORE, ISD::ATOMIC_STORE,
- ISD::FP_ROUND, ISD::STRICT_FP_ROUND})
+ for (auto Op : {ISD::LOAD, ISD::ATOMIC_LOAD, ISD::STORE, ISD::ATOMIC_STORE})
setOperationAction(Op, MVT::f16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall);
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
++I) {
MVT VT = MVT::SimpleValueType(I);
- if (isTypeLegal(VT)) {
- // No special instructions for these.
- setOperationAction(ISD::FSIN, VT, Expand);
- setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FPOW, VT, Expand);
- if (VT == MVT::f16)
- continue;
-
+ if (isTypeLegal(VT) && VT != MVT::f16) {
// We can use FI for FRINT.
setOperationAction(ISD::FRINT, VT, Legal);
@@ -555,6 +546,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FROUND, VT, Legal);
}
+ // No special instructions for these.
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+
// Special treatment.
setOperationAction(ISD::IS_FPCLASS, VT, Custom);
@@ -6157,80 +6155,18 @@ static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) {
return Op;
}
-SDValue SystemZTargetLowering::LowerFP_EXTEND(SDValue Op,
+SDValue SystemZTargetLowering::lowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
- bool IsStrict = Op->isStrictFPOpcode();
- SDValue In = Op.getOperand(IsStrict ? 1 : 0);
- MVT VT = Op.getSimpleValueType();
- MVT SVT = In.getSimpleValueType();
- if (SVT != MVT::f16)
+ SDValue In = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0);
+ if (In.getSimpleValueType() != MVT::f16)
return Op; // Legal
-
- SDLoc DL(Op);
- SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
-
- // Need a libcall. XXX factor out (below)
- TargetLowering::CallLoweringInfo CLI(DAG);
- Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = In;
- Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
- Args.push_back(Entry);
- SDValue Callee = DAG.getExternalSymbol(
- getLibcallName(RTLIB::FPEXT_F16_F32), getPointerTy(DAG.getDataLayout()));
- CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- CallingConv::C, EVT(MVT::f32).getTypeForEVT(*DAG.getContext()), Callee,
- std::move(Args));
- SDValue Res;
- std::tie(Res,Chain) = LowerCallTo(CLI);
- if (IsStrict)
- Res = DAG.getMergeValues({Res, Chain}, DL);
-
- return DAG.getNode(ISD::FP_EXTEND, DL, VT, Res);
-}
-
-SDValue SystemZTargetLowering::LowerFP_ROUND(SDValue Op,
- SelectionDAG &DAG) const {
- bool IsStrict = Op->isStrictFPOpcode();
- SDValue In = Op.getOperand(IsStrict ? 1 : 0);
- MVT VT = Op.getSimpleValueType();
- MVT SVT = In.getSimpleValueType();
- assert(VT == MVT::f16 && "Only rounding to f16 needs custom handling.");
-
- SDLoc DL(Op);
- SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
-
- if (SVT != MVT::f32) {
- SDValue Rnd = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
- In = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Rnd);
- }
-
- // We need a libcall.
- TargetLowering::CallLoweringInfo CLI(DAG);
- Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = In;
- Entry.Ty = EVT(MVT::f32).getTypeForEVT(*DAG.getContext());
- Args.push_back(Entry);
- SDValue Callee = DAG.getExternalSymbol(
- getLibcallName(RTLIB::FPROUND_F32_F16), getPointerTy(DAG.getDataLayout()));
- CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- CallingConv::C, EVT(MVT::f16).getTypeForEVT(*DAG.getContext()), Callee,
- std::move(Args));
- SDValue Res;
- std::tie(Res, Chain) = LowerCallTo(CLI);
- if (IsStrict)
- Res = DAG.getMergeValues({Res, Chain}, DL);
- return Res;
+ return SDValue(); // Let legalizer emit the libcall.
}
SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
SelectionDAG &DAG) const {
MVT RegVT = Op.getSimpleValueType();
- if (RegVT != MVT::f16)
- return SDValue();
+ assert(RegVT == MVT::f16 && "Expected to lower an f16 load.");
SDLoc DL(Op);
SDValue NewLd;
@@ -6262,8 +6198,7 @@ SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
SelectionDAG &DAG) const {
SDValue StoredVal = Op->getOperand(1);
MVT StoreVT = StoredVal.getSimpleValueType();
- if (StoreVT != MVT::f16)
- return SDValue();
+ assert(StoreVT == MVT::f16 && "Expected to lower an f16 store.");
// Move into a GPR, shift and store the 2 bytes. TODO: Use VSTEH if available.
SDLoc DL(Op);
@@ -6463,10 +6398,7 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND:
- return LowerFP_EXTEND(Op, DAG);
- case ISD::FP_ROUND:
- case ISD::STRICT_FP_ROUND:
- return LowerFP_ROUND(Op, DAG);
+ return lowerFP_EXTEND(Op, DAG);
case ISD::LOAD:
return lowerLoadF16(Op, DAG);
case ISD::STORE:
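
The comments in lowerLoadF16/lowerStoreF16 describe moving the 2-byte value
through a GPR with a 16-bit shift; stripped of the DAG machinery, the memory
side of that lowering amounts to the following sketch (names are
illustrative):

  #include <cstdint>
  #include <cstring>

  // Load: fetch 2 bytes and place them in the high half of a 32-bit lane,
  // which is the sll 16 visible in the generated code.
  static uint32_t load_f16_bits(const void *p) {
    uint16_t raw;
    std::memcpy(&raw, p, sizeof raw);
    return static_cast<uint32_t>(raw) << 16;
  }

  // Store: shift the bits back down (srl 16) and write 2 bytes (sth).
  static void store_f16_bits(void *p, uint32_t lane) {
    uint16_t raw = static_cast<uint16_t>(lane >> 16);
    std::memcpy(p, &raw, sizeof raw);
  }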
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index d3f700561c29ac..3f54563039a9ae 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -721,8 +721,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
- SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerLoadF16(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerStoreF16(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll b/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll
new file mode 100644
index 00000000000000..6e813a4a5094d7
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll
@@ -0,0 +1,312 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test that library calls are emitted for LLVM IR intrinsics on half.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define half @f1(half %x, i16 %y) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lhr %r13, %r2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: llgfr %r2, %r13
+; CHECK-NEXT: brasl %r14, __powisf2 at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r13, %r15, 264(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.powi.f16.i16(half %x, i16 %y)
+ ret half %tmp
+}
+
+define half @f2(half %x, half %y) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f8, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f2, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, powf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.pow.f16(half %x, half %y)
+ ret half %tmp
+}
+
+define half @f3(half %x) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, sinf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.sin.f16(half %x)
+ ret half %tmp
+}
+
+define half @f4(half %x) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, cosf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.cos.f16(half %x)
+ ret half %tmp
+}
+
+define half @f5(half %x) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, expf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.exp.f16(half %x)
+ ret half %tmp
+}
+
+define half @f6(half %x) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, exp2f at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.exp2.f16(half %x)
+ ret half %tmp
+}
+
+define half @f7(half %x) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, logf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.log.f16(half %x)
+ ret half %tmp
+}
+
+define half @f8(half %x) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, log2f at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.log2.f16(half %x)
+ ret half %tmp
+}
+
+define half @f9(half %x) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: brasl %r14, log10f at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: lmg %r14, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.log10.f16(half %x)
+ ret half %tmp
+}
+
+define half @f10(half %x, half %y) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f8, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f2, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, fminf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.minnum.f16(half %x, half %y)
+ ret half %tmp
+}
+
+define half @f11(half %x, half %y) {
+; CHECK-LABEL: f11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f8, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f2, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, fmaxf at PLT
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call half @llvm.maxnum.f16(half %x, half %y)
+ ret half %tmp
+}
+
+; Verify that "nnan" minnum/maxnum calls are transformed to
+; compare+select sequences instead of libcalls.
+define half @f12(half %x, half %y) {
+; CHECK-LABEL: f12:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f8
+; CHECK-NEXT: jl .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call nnan half @llvm.minnum.f16(half %x, half %y)
+ ret half %tmp
+}
+
+define half @f13(half %x, half %y) {
+; CHECK-LABEL: f13:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: .cfi_offset %f9, -176
+; CHECK-NEXT: ler %f9, %f0
+; CHECK-NEXT: ler %f0, %f2
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: ler %f8, %f0
+; CHECK-NEXT: ler %f0, %f9
+; CHECK-NEXT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-NEXT: cebr %f0, %f8
+; CHECK-NEXT: jh .LBB12_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ler %f0, %f8
+; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: brasl %r14, __truncsfhf2 at PLT
+; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT: br %r14
+ %tmp = call nnan half @llvm.maxnum.f16(half %x, half %y)
+ ret half %tmp
+}
+
+declare half @llvm.powi.f16.i16(half, i16)
+declare half @llvm.pow.f16(half, half)
+
+declare half @llvm.sin.f16(half)
+declare half @llvm.cos.f16(half)
+
+declare half @llvm.exp.f16(half)
+declare half @llvm.exp2.f16(half)
+
+declare half @llvm.log.f16(half)
+declare half @llvm.log2.f16(half)
+declare half @llvm.log10.f16(half)
+
+declare half @llvm.minnum.f16(half, half)
+declare half @llvm.maxnum.f16(half, half)
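
The nnan cases (f12/f13) correspond roughly to min/max written under
no-NaNs assumptions; a source-level sketch (the flag and builtin are
standard clang, but the mapping to the nnan IR flag is an assumption of
this example):

  // Built with -ffinite-math-only, the promoted fminf can be lowered to a
  // compare+select instead of a libcall, as f12 checks.
  _Float16 min_h(_Float16 x, _Float16 y) {
    return (_Float16)__builtin_fminf((float)x, (float)y);
  }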
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-strict.ll b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll
index 7d8acfa40eb313..f02b0c79536cf1 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-strict.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll
@@ -83,11 +83,9 @@ define void @fun1(ptr %Src, ptr %Dst) #0 {
; NOVEC-NEXT: lgr %r13, %r3
; NOVEC-NEXT: ldgr %f0, %r0
; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: ldebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
; NOVEC-NEXT: adbr %f0, %f0
-; NOVEC-NEXT: ledbr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
; NOVEC-NEXT: lgdr %r0, %f0
; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
@@ -110,11 +108,9 @@ define void @fun1(ptr %Src, ptr %Dst) #0 {
; VECTOR-NEXT: lgr %r13, %r3
; VECTOR-NEXT: vlvgf %v0, %r0, 0
; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
; VECTOR-NEXT: adbr %f0, %f0
-; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
; VECTOR-NEXT: vlgvf %r0, %v0, 0
; VECTOR-NEXT: srl %r0, 16
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
index ed405ce82c1cd2..d6dd0f60e4235a 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
@@ -361,33 +361,30 @@ define <2 x half> @fun2(<2 x half> %Op) {
; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
; NOVEC-NEXT: .cfi_offset %r14, -48
; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -184
-; NOVEC-NEXT: .cfi_def_cfa_offset 344
-; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
-; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
-; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
; NOVEC-NEXT: .cfi_offset %f8, -168
; NOVEC-NEXT: .cfi_offset %f9, -176
-; NOVEC-NEXT: .cfi_offset %f10, -184
; NOVEC-NEXT: ler %f8, %f2
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: ldebr %f9, %f0
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
+; NOVEC-NEXT: ldr %f9, %f0
; NOVEC-NEXT: ler %f0, %f8
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
; NOVEC-NEXT: adbr %f9, %f9
-; NOVEC-NEXT: ldebr %f10, %f0
-; NOVEC-NEXT: ledbr %f0, %f9
-; NOVEC-NEXT: adbr %f10, %f10
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f8, %f0
-; NOVEC-NEXT: ledbr %f0, %f10
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ldr %f8, %f0
+; NOVEC-NEXT: adbr %f8, %f0
+; NOVEC-NEXT: ldr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ldr %f0, %f8
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
; NOVEC-NEXT: ler %f2, %f0
-; NOVEC-NEXT: ler %f0, %f8
-; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
-; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
-; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
-; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
; NOVEC-NEXT: br %r14
;
; VECTOR-LABEL: fun2:
@@ -401,24 +398,23 @@ define <2 x half> @fun2(<2 x half> %Op) {
; VECTOR-NEXT: .cfi_offset %f8, -168
; VECTOR-NEXT: ldr %f8, %f0
; VECTOR-NEXT: ldr %f0, %f2
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
+; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
; VECTOR-NEXT: ldr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0
; VECTOR-NEXT: vmrhg %v0, %v0, %v1
; VECTOR-NEXT: vfadb %v0, %v0, %v0
-; VECTOR-NEXT: vledb %v0, %v0, 0, 0
; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
-; VECTOR-NEXT: # kill: def $f0s killed $f0s killed $v0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
; VECTOR-NEXT: ldr %f8, %f0
; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
-; VECTOR-NEXT: vrepf %v0, %v0, 2
-; VECTOR-NEXT: # kill: def $f0s killed $f0s killed $v0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: vrepg %v0, %v0, 1
+; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
; VECTOR-NEXT: ldr %f2, %f0
; VECTOR-NEXT: ldr %f0, %f8
; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
index c704efe2c26eb0..4e6db003d2d136 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -73,14 +73,12 @@ define half @fun1(half %Op0, half %Op1) {
; NOVEC-NEXT: .cfi_offset %f8, -168
; NOVEC-NEXT: .cfi_offset %f9, -176
; NOVEC-NEXT: ler %f8, %f2
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: ldebr %f9, %f0
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
+; NOVEC-NEXT: ldr %f9, %f0
; NOVEC-NEXT: ler %f0, %f8
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: ldebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
; NOVEC-NEXT: adbr %f0, %f9
-; NOVEC-NEXT: ledbr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
@@ -98,14 +96,12 @@ define half @fun1(half %Op0, half %Op1) {
; VECTOR-NEXT: .cfi_offset %f8, -168
; VECTOR-NEXT: .cfi_offset %f9, -176
; VECTOR-NEXT: ldr %f8, %f2
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f9, %f0
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
; VECTOR-NEXT: ldr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
; VECTOR-NEXT: wfadb %f0, %f9, %f0
-; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
@@ -124,28 +120,33 @@ define half @fun2(half %Op0, half %Op1) {
; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
; NOVEC-NEXT: .cfi_offset %r14, -48
; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -184
-; NOVEC-NEXT: .cfi_def_cfa_offset 344
-; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
-; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
-; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: aghi %r15, -232
+; NOVEC-NEXT: .cfi_def_cfa_offset 392
+; NOVEC-NEXT: std %f8, 224(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 216(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 208(%r15) # 8-byte Folded Spill
; NOVEC-NEXT: .cfi_offset %f8, -168
; NOVEC-NEXT: .cfi_offset %f9, -176
; NOVEC-NEXT: .cfi_offset %f11, -184
+; NOVEC-NEXT: la %r2, 160(%r15)
; NOVEC-NEXT: ler %f8, %f2
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: lxebr %f9, %f0
+; NOVEC-NEXT: brasl %r14, __extendhftf2 at PLT
+; NOVEC-NEXT: ld %f9, 160(%r15)
+; NOVEC-NEXT: ld %f11, 168(%r15)
+; NOVEC-NEXT: la %r2, 176(%r15)
; NOVEC-NEXT: ler %f0, %f8
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: lxebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __extendhftf2 at PLT
+; NOVEC-NEXT: ld %f0, 176(%r15)
+; NOVEC-NEXT: ld %f2, 184(%r15)
+; NOVEC-NEXT: la %r2, 192(%r15)
; NOVEC-NEXT: axbr %f0, %f9
-; NOVEC-NEXT: lexbr %f0, %f0
-; NOVEC-NEXT: # kill: def $f0s killed $f0s killed $f0q
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
-; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
-; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
-; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: std %f0, 192(%r15)
+; NOVEC-NEXT: std %f2, 200(%r15)
+; NOVEC-NEXT: brasl %r14, __trunctfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 224(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 216(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 208(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 344(%r15)
; NOVEC-NEXT: br %r14
;
; VECTOR-LABEL: fun2:
@@ -153,26 +154,26 @@ define half @fun2(half %Op0, half %Op1) {
; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -184
-; VECTOR-NEXT: .cfi_def_cfa_offset 344
-; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: aghi %r15, -232
+; VECTOR-NEXT: .cfi_def_cfa_offset 392
+; VECTOR-NEXT: std %f8, 224(%r15) # 8-byte Folded Spill
; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: la %r2, 176(%r15)
; VECTOR-NEXT: ldr %f8, %f2
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f0, %f0
-; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: brasl %r14, __extendhftf2 at PLT
+; VECTOR-NEXT: vl %v0, 176(%r15), 3
; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: la %r2, 192(%r15)
; VECTOR-NEXT: ldr %f0, %f8
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f0, %f0
-; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: brasl %r14, __extendhftf2 at PLT
+; VECTOR-NEXT: vl %v0, 192(%r15), 3
; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload
; VECTOR-NEXT: wfaxb %v0, %v1, %v0
-; VECTOR-NEXT: wflrx %f0, %v0, 0, 3
-; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
-; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
-; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: la %r2, 208(%r15)
+; VECTOR-NEXT: vst %v0, 208(%r15), 3
+; VECTOR-NEXT: brasl %r14, __trunctfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 224(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 344(%r15)
; VECTOR-NEXT: br %r14
entry:
%E0 = fpext half %Op0 to fp128
@@ -253,11 +254,9 @@ define void @fun4(ptr %Src, ptr %Dst) {
; NOVEC-NEXT: lgr %r13, %r3
; NOVEC-NEXT: ldgr %f0, %r0
; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: ldebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __extendhfdf2 at PLT
; NOVEC-NEXT: adbr %f0, %f0
-; NOVEC-NEXT: ledbr %f0, %f0
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: brasl %r14, __truncdfhf2 at PLT
; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
; NOVEC-NEXT: lgdr %r0, %f0
; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
@@ -279,11 +278,9 @@ define void @fun4(ptr %Src, ptr %Dst) {
; VECTOR-NEXT: lgr %r13, %r3
; VECTOR-NEXT: vlvgf %v0, %r0, 0
; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __extendhfdf2 at PLT
; VECTOR-NEXT: adbr %f0, %f0
-; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: brasl %r14, __truncdfhf2 at PLT
; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
; VECTOR-NEXT: vlgvf %r0, %v0, 0
; VECTOR-NEXT: srl %r0, 16
@@ -306,26 +303,29 @@ define void @fun5(ptr %Src, ptr %Dst) {
; NOVEC-NEXT: .cfi_offset %r13, -56
; NOVEC-NEXT: .cfi_offset %r14, -48
; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -160
-; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: aghi %r15, -192
+; NOVEC-NEXT: .cfi_def_cfa_offset 352
; NOVEC-NEXT: lh %r0, 0(%r2)
; NOVEC-NEXT: sll %r0, 16
; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: la %r2, 160(%r15)
; NOVEC-NEXT: lgr %r13, %r3
; NOVEC-NEXT: ldgr %f0, %r0
; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
-; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
-; NOVEC-NEXT: lxebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __extendhftf2 at PLT
+; NOVEC-NEXT: ld %f0, 160(%r15)
+; NOVEC-NEXT: ld %f2, 168(%r15)
+; NOVEC-NEXT: la %r2, 176(%r15)
; NOVEC-NEXT: axbr %f0, %f0
-; NOVEC-NEXT: lexbr %f0, %f0
-; NOVEC-NEXT: # kill: def $f0s killed $f0s killed $f0q
-; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: std %f0, 176(%r15)
+; NOVEC-NEXT: std %f2, 184(%r15)
+; NOVEC-NEXT: brasl %r14, __trunctfhf2 at PLT
; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
; NOVEC-NEXT: lgdr %r0, %f0
; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
; NOVEC-NEXT: srl %r0, 16
; NOVEC-NEXT: sth %r0, 0(%r13)
-; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: lmg %r13, %r15, 296(%r15)
; NOVEC-NEXT: br %r14
;
; VECTOR-LABEL: fun5:
@@ -334,25 +334,25 @@ define void @fun5(ptr %Src, ptr %Dst) {
; VECTOR-NEXT: .cfi_offset %r13, -56
; VECTOR-NEXT: .cfi_offset %r14, -48
; VECTOR-NEXT: .cfi_offset %r15, -40
-; VECTOR-NEXT: aghi %r15, -160
-; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
; VECTOR-NEXT: lh %r0, 0(%r2)
; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: la %r2, 160(%r15)
; VECTOR-NEXT: lgr %r13, %r3
; VECTOR-NEXT: vlvgf %v0, %r0, 0
; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
-; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
-; VECTOR-NEXT: ldebr %f0, %f0
-; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: brasl %r14, __extendhftf2 at PLT
+; VECTOR-NEXT: vl %v0, 160(%r15), 3
; VECTOR-NEXT: wfaxb %v0, %v0, %v0
-; VECTOR-NEXT: wflrx %f0, %v0, 0, 3
-; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
-; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: la %r2, 176(%r15)
+; VECTOR-NEXT: vst %v0, 176(%r15), 3
+; VECTOR-NEXT: brasl %r14, __trunctfhf2 at PLT
; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
; VECTOR-NEXT: vlgvf %r0, %v0, 0
; VECTOR-NEXT: srl %r0, 16
; VECTOR-NEXT: sth %r0, 0(%r13)
-; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: lmg %r13, %r15, 296(%r15)
; VECTOR-NEXT: br %r14
entry:
%Op0 = load half, ptr %Src, align 2
@@ -682,3 +682,18 @@ entry:
call void @foo2(half 0.0, half 1.0, half 0.375)
ret void
}
+
+; Test a tail call.
+declare void @foo3(half)
+define void @fun12(half %Arg0) {
+; NOVEC-LABEL: fun12:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: jg foo3 at PLT
+;
+; VECTOR-LABEL: fun12:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: jg foo3 at PLT
+entry:
+ tail call void @foo3(half %Arg0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
index a004a5f9a7bd12..4a38d7afba2c9d 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
@@ -1,24 +1,8 @@
; Test that combined sin/cos library call is emitted when appropriate
-; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-OPT
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT
; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT
-; Test f16 libcall.
-define half @f0(half %x) {
-; CHECK-OPT-LABEL: f0:
-; CHECK-OPT-NOT: brasl %r14, __extendhfsf2 at PLT
-; CHECK-OPT: brasl %r14, sinh at PLT
-; CHECK-OPT: brasl %r14, cosh at PLT
-; CHECK-OPT: brasl %r14, __extendhfsf2 at PLT
-; CHECK-OPT: brasl %r14, __extendhfsf2 at PLT
-; CHECK-OPT: aebr %f0, %f8
-; CHECK-OPT: brasl %r14, __truncsfhf2 at PLT
- %tmp1 = call half @sinh(half %x) readnone
- %tmp2 = call half @cosh(half %x) readnone
- %add = fadd half %tmp1, %tmp2
- ret half %add
-}
-
define float @f1(float %x) {
; CHECK-OPT-LABEL: f1:
; CHECK-OPT: brasl %r14, sincosf at PLT
@@ -86,11 +70,9 @@ define fp128 @f3_errno(fp128 %x) {
ret fp128 %add
}
-declare half @sinh(half)
declare float @sinf(float)
declare double @sin(double)
declare fp128 @sinl(fp128)
-declare half @cosh(half)
declare float @cosf(float)
declare double @cos(double)
declare fp128 @cosl(fp128)