[clang] [compiler-rt] [llvm] [SystemZ] Add support for half (fp16) (PR #109164)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 12:15:19 PST 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/109164
>From 180236bdc873b2b719ad6556ed9b93dbbd523aff Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 17 Sep 2024 19:34:34 +0200
Subject: [PATCH 1/4] Initial experiments (with integer regs for fp16).
---
clang/lib/Basic/Targets/SystemZ.h | 12 ++
clang/lib/CodeGen/Targets/SystemZ.cpp | 12 +-
clang/lib/Sema/SemaExpr.cpp | 2 +-
.../test/CodeGen/SystemZ/fexcess-precision.c | 85 ++++++++
clang/test/CodeGen/SystemZ/systemz-abi.c | 44 ++++
llvm/lib/Target/SystemZ/SystemZCallingConv.td | 4 +-
.../Target/SystemZ/SystemZISelLowering.cpp | 62 +++++-
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 16 +-
llvm/test/CodeGen/SystemZ/fp-half.ll | 201 ++++++++++++++++++
9 files changed, 420 insertions(+), 18 deletions(-)
create mode 100644 clang/test/CodeGen/SystemZ/fexcess-precision.c
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half.ll
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index ef9a07033a6e4ff..f665a0b72a8e384 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -91,11 +91,23 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
"-v128:64-a:8:16-n32:64");
}
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128;
+
+ // True if the backend supports operations on the half LLVM IR type.
+ HasLegalHalfType = false;
+ // Allow half arguments and return values.
+ HalfArgsAndReturns = true;
+ // Support _Float16.
+ HasFloat16 = true;
+
HasStrictFP = true;
}
unsigned getMinGlobalAlign(uint64_t Size, bool HasNonWeakDef) const override;
+ bool useFP16ConversionIntrinsics() const override {
+ return false;
+ }
+
void getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const override;
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index 23c96fa5cf98cb3..5e313c999240af7 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,6 +185,8 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
switch (BT->getKind()) {
+// case BuiltinType::Half: // __fp16 Support __fp16??
+ case BuiltinType::Float16: // _Float16
case BuiltinType::Float:
case BuiltinType::Double:
return true;
@@ -277,7 +279,8 @@ RValue SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
} else {
if (AI.getCoerceToType())
ArgTy = AI.getCoerceToType();
- InFPRs = (!IsSoftFloatABI && (ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
+ InFPRs = (!IsSoftFloatABI &&
+ (ArgTy->isHalfTy() || ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
IsVector = ArgTy->isVectorTy();
UnpaddedSize = TyInfo.Width;
DirectAlign = TyInfo.Align;
@@ -446,10 +449,11 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
// The structure is passed as an unextended integer, a float, or a double.
if (isFPArgumentType(SingleElementTy)) {
- assert(Size == 32 || Size == 64);
+ assert(Size == 16 || Size == 32 || Size == 64);
return ABIArgInfo::getDirect(
- Size == 32 ? llvm::Type::getFloatTy(getVMContext())
- : llvm::Type::getDoubleTy(getVMContext()));
+ Size == 16 ? llvm::Type::getHalfTy(getVMContext())
+ : Size == 32 ? llvm::Type::getFloatTy(getVMContext())
+ : llvm::Type::getDoubleTy(getVMContext()));
} else {
llvm::IntegerType *PassTy = llvm::IntegerType::get(getVMContext(), Size);
return Size <= 32 ? ABIArgInfo::getNoExtend(PassTy)
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 7f3cff1054aeed5..9c8d821a775db84 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -16547,7 +16547,7 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,
PromoteType = QualType();
}
}
- if (TInfo->getType()->isSpecificBuiltinType(BuiltinType::Float))
+ if (TInfo->getType()->isFloat16Type() || TInfo->getType()->isFloat32Type())
PromoteType = Context.DoubleTy;
if (!PromoteType.isNull())
DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,
diff --git a/clang/test/CodeGen/SystemZ/fexcess-precision.c b/clang/test/CodeGen/SystemZ/fexcess-precision.c
new file mode 100644
index 000000000000000..4444dbdcc23ca05
--- /dev/null
+++ b/clang/test/CodeGen/SystemZ/fexcess-precision.c
@@ -0,0 +1,85 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=standard -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=STANDARD
+
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=none -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=NONE
+
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=fast -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=FAST
+
+_Float16 f(_Float16 a, _Float16 b, _Float16 c, _Float16 d) {
+ return a * b + c * d;
+}
+
+// STANDARD-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// STANDARD-NEXT: entry:
+// STANDARD-NEXT: %a.addr = alloca half, align 2
+// STANDARD-NEXT: %b.addr = alloca half, align 2
+// STANDARD-NEXT: %c.addr = alloca half, align 2
+// STANDARD-NEXT: %d.addr = alloca half, align 2
+// STANDARD-NEXT: store half %a, ptr %a.addr, align 2
+// STANDARD-NEXT: store half %b, ptr %b.addr, align 2
+// STANDARD-NEXT: store half %c, ptr %c.addr, align 2
+// STANDARD-NEXT: store half %d, ptr %d.addr, align 2
+// STANDARD-NEXT: %0 = load half, ptr %a.addr, align 2
+// STANDARD-NEXT: %ext = fpext half %0 to float
+// STANDARD-NEXT: %1 = load half, ptr %b.addr, align 2
+// STANDARD-NEXT: %ext1 = fpext half %1 to float
+// STANDARD-NEXT: %mul = fmul float %ext, %ext1
+// STANDARD-NEXT: %2 = load half, ptr %c.addr, align 2
+// STANDARD-NEXT: %ext2 = fpext half %2 to float
+// STANDARD-NEXT: %3 = load half, ptr %d.addr, align 2
+// STANDARD-NEXT: %ext3 = fpext half %3 to float
+// STANDARD-NEXT: %mul4 = fmul float %ext2, %ext3
+// STANDARD-NEXT: %add = fadd float %mul, %mul4
+// STANDARD-NEXT: %unpromotion = fptrunc float %add to half
+// STANDARD-NEXT: ret half %unpromotion
+// STANDARD-NEXT: }
+
+// NONE-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// NONE-NEXT: entry:
+// NONE-NEXT: %a.addr = alloca half, align 2
+// NONE-NEXT: %b.addr = alloca half, align 2
+// NONE-NEXT: %c.addr = alloca half, align 2
+// NONE-NEXT: %d.addr = alloca half, align 2
+// NONE-NEXT: store half %a, ptr %a.addr, align 2
+// NONE-NEXT: store half %b, ptr %b.addr, align 2
+// NONE-NEXT: store half %c, ptr %c.addr, align 2
+// NONE-NEXT: store half %d, ptr %d.addr, align 2
+// NONE-NEXT: %0 = load half, ptr %a.addr, align 2
+// NONE-NEXT: %1 = load half, ptr %b.addr, align 2
+// NONE-NEXT: %mul = fmul half %0, %1
+// NONE-NEXT: %2 = load half, ptr %c.addr, align 2
+// NONE-NEXT: %3 = load half, ptr %d.addr, align 2
+// NONE-NEXT: %mul1 = fmul half %2, %3
+// NONE-NEXT: %add = fadd half %mul, %mul1
+// NONE-NEXT: ret half %add
+// NONE-NEXT: }
+
+// FAST-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// FAST-NEXT: entry:
+// FAST-NEXT: %a.addr = alloca half, align 2
+// FAST-NEXT: %b.addr = alloca half, align 2
+// FAST-NEXT: %c.addr = alloca half, align 2
+// FAST-NEXT: %d.addr = alloca half, align 2
+// FAST-NEXT: store half %a, ptr %a.addr, align 2
+// FAST-NEXT: store half %b, ptr %b.addr, align 2
+// FAST-NEXT: store half %c, ptr %c.addr, align 2
+// FAST-NEXT: store half %d, ptr %d.addr, align 2
+// FAST-NEXT: %0 = load half, ptr %a.addr, align 2
+// FAST-NEXT: %ext = fpext half %0 to float
+// FAST-NEXT: %1 = load half, ptr %b.addr, align 2
+// FAST-NEXT: %ext1 = fpext half %1 to float
+// FAST-NEXT: %mul = fmul float %ext, %ext1
+// FAST-NEXT: %2 = load half, ptr %c.addr, align 2
+// FAST-NEXT: %ext2 = fpext half %2 to float
+// FAST-NEXT: %3 = load half, ptr %d.addr, align 2
+// FAST-NEXT: %ext3 = fpext half %3 to float
+// FAST-NEXT: %mul4 = fmul float %ext2, %ext3
+// FAST-NEXT: %add = fadd float %mul, %mul4
+// FAST-NEXT: %unpromotion = fptrunc float %add to half
+// FAST-NEXT: ret half %unpromotion
+// FAST-NEXT: }
diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c
index fd2b5d450cc643e..2287126bdeabece 100644
--- a/clang/test/CodeGen/SystemZ/systemz-abi.c
+++ b/clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -45,6 +45,9 @@ long long pass_longlong(long long arg) { return arg; }
__int128 pass_int128(__int128 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0)
+_Float16 pass__Float16(_Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}})
+
float pass_float(float arg) { return arg; }
// CHECK-LABEL: define{{.*}} float @pass_float(float %{{.*}})
@@ -72,6 +75,9 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; }
_Complex long long pass_complex_longlong(_Complex long long arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg)
+_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg)
+
_Complex float pass_complex_float(_Complex float arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr %{{.*}}arg)
@@ -123,6 +129,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; }
// Float-like aggregate types
+struct agg__Float16 { _Float16 a; };
+struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
+
struct agg_float { float a; };
struct agg_float pass_agg_float(struct agg_float arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, float %{{.*}})
@@ -137,6 +148,11 @@ struct agg_longdouble { long double a; };
struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}})
+struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); };
+struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, i64 %{{.*}})
+
struct agg_float_a8 { float a __attribute__((aligned (8))); };
struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float_a8(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a8) align 8 %{{.*}}, double %{{.*}})
@@ -164,6 +180,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; }
// Union types likewise are *not* float-like aggregate types
+union union__Float16 { _Float16 a; };
+union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
+
union union_float { float a; };
union union_float pass_union_float(union union_float arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_union_float(ptr dead_on_unwind noalias writable sret(%union.union_float) align 4 %{{.*}}, i32 noext %{{.*}})
@@ -441,6 +461,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l,
// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
// CHECK: ret void
+struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); }
+// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}}
+// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
+// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0
+// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]]
+// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4
+// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5
+// CHECK: br i1 [[FITS_IN_REGS]],
+// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8
+// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128
+// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22
+// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3
+// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]]
+// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]]
+// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1
+// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]]
+// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2
+// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6
+// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8
+// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
+// CHECK: ret void
+
struct agg_float va_agg_float(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float); }
// CHECK-LABEL: define{{.*}} void @va_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, ptr %{{.*}}
// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 99bb697ce20142a..4f736e1c1718741 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,6 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
// other floating-point argument registers available for code that
// doesn't care about the ABI. All floating-point argument registers
// are call-clobbered, so we can use all of them here.
+ CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
@@ -115,6 +116,7 @@ def CC_SystemZ_ELF : CallingConv<[
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
// The first 4 float and double arguments are passed in even registers F0-F6.
+ CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
@@ -138,7 +140,7 @@ def CC_SystemZ_ELF : CallingConv<[
CCAssignToStack<16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+ CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>
]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 3999b54de81b657..36bf18d7dcc1a34 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -711,6 +711,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::f32, Custom);
}
+ // Expand FP16 <=> FP32 conversions to libcalls and handle FP16 loads and
+ // stores in GPRs.
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+
// VASTART and VACOPY need to deal with the SystemZ-specific varargs
// structure, but VAEND is a no-op.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -784,6 +791,20 @@ bool SystemZTargetLowering::useSoftFloat() const {
return Subtarget.hasSoftFloat();
}
+MVT SystemZTargetLowering::getRegisterTypeForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC,
+ EVT VT) const {
+ // 128-bit single-element vector types are passed like other vectors,
+ // not like their element type.
+ if (VT.isVector() && VT.getSizeInBits() == 128 &&
+ VT.getVectorNumElements() == 1)
+ return MVT::v16i8;
+ // Keep f16 so that they can be recognized and handled.
+ if (VT == MVT::f16)
+ return MVT::f16;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &, EVT VT) const {
if (!VT.isVector())
@@ -1602,6 +1623,15 @@ bool SystemZTargetLowering::splitValueIntoRegisterParts(
return true;
}
+ // Convert f16 to f32 (Out-arg).
+ if (PartVT == MVT::f16) {
+ assert(NumParts == 1 && "");
+ SDValue I16Val = DAG.getBitcast(MVT::i16, Val);
+ SDValue I32Val = DAG.getAnyExtOrTrunc(I16Val, DL, MVT::i32);
+ Parts[0] = DAG.getBitcast(MVT::f32, I32Val);
+ return true;
+ }
+
return false;
}
@@ -1617,6 +1647,18 @@ SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
return SDValue();
}
+// F32Val holds a f16 value in f32, return it as an f16 (In-arg). The
+// CopyFromReg was made into an f32 as required as FP32 registers are used
+// for arguments, now convert it to f16.
+static SDValue convertF32ToF16(SDValue F32Val, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ assert(F32Val->getOpcode() == ISD::CopyFromReg &&
+ "Only expecting to handle f16 with CopyFromReg here.");
+ SDValue I32Val = DAG.getBitcast(MVT::i32, F32Val);
+ SDValue I16Val = DAG.getAnyExtOrTrunc(I32Val, DL, MVT::i16);
+ return DAG.getBitcast(MVT::f16, I16Val);
+}
+
SDValue SystemZTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1656,6 +1698,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
NumFixedGPRs += 1;
RC = &SystemZ::GR64BitRegClass;
break;
+ case MVT::f16:
case MVT::f32:
NumFixedFPRs += 1;
RC = &SystemZ::FP32BitRegClass;
@@ -1680,7 +1723,11 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
Register VReg = MRI.createVirtualRegister(RC);
MRI.addLiveIn(VA.getLocReg(), VReg);
- ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
+ // Special handling is needed for f16.
+ MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
+ ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, ArgVT);
+ if (VA.getLocVT() == MVT::f16)
+ ArgValue = convertF32ToF16(ArgValue, DAG, DL);
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
@@ -1700,9 +1747,12 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
// from this parameter. Unpromoted ints and floats are
// passed as right-justified 8-byte values.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+ if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32 ||
+ VA.getLocVT() == MVT::f16) {
+ unsigned SlotOffs = VA.getLocVT() == MVT::f16 ? 6 : 4;
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
- DAG.getIntPtrConstant(4, DL));
+ DAG.getIntPtrConstant(SlotOffs, DL));
+ }
ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
}
@@ -2121,10 +2171,14 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Copy all of the result registers out of their specified physreg.
for (CCValAssign &VA : RetLocs) {
// Copy the value out, gluing the copy to the end of the call sequence.
+ // Special handling is needed for f16.
+ MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
- VA.getLocVT(), Glue);
+ ArgVT, Glue);
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
+ if (VA.getLocVT() == MVT::f16)
+ RetValue = convertF32ToF16(RetValue, DAG, DL);
// Convert the value of the return register into the value that's
// being returned.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 3c06c1fdf2b1bca..ecffa4deea0f5a6 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -447,14 +447,7 @@ class SystemZTargetLowering : public TargetLowering {
return TargetLowering::getNumRegisters(Context, VT);
}
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const override {
- // 128-bit single-element vector types are passed like other vectors,
- // not like their element type.
- if (VT.isVector() && VT.getSizeInBits() == 128 &&
- VT.getVectorNumElements() == 1)
- return MVT::v16i8;
- return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
- }
+ EVT VT) const override;
bool isCheapToSpeculateCtlz(Type *) const override { return true; }
bool isCheapToSpeculateCttz(Type *) const override { return true; }
bool preferZeroCompareBranch() const override { return true; }
@@ -476,6 +469,13 @@ class SystemZTargetLowering : public TargetLowering {
// LD, and having the full constant in memory enables reg/mem opcodes.
return VT != MVT::f64;
}
+ bool softPromoteHalfType() const override { return true; }
+ bool useFPRegsForHalfType() const override { return true; }
+ bool shouldKeepZExtForFP16Conv() const override {
+ // Keep the zero extension from 16 bits if present (as with incoming
+ // arguments).
+ return true;
+ }
bool hasInlineStackProbe(const MachineFunction &MF) const override;
AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
new file mode 100644
index 000000000000000..51ac42b012388ed
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
+;
+; Tests for 16-bit floating point (half).
+
+; Incoming half arguments added together and returned.
+define half @fun0(half %Op0, half %Op1) {
+; CHECK-LABEL: fun0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: vlgvf %r0, %v2, 0
+; CHECK-NEXT: llghr %r2, %r0
+; CHECK-NEXT: vlgvf %r13, %v0, 0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: llghr %r2, %r13
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: aebr %f0, %f8
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
+; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: vlvgf %v0, %r2, 0
+; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ %Res = fadd half %Op0, %Op1
+ ret half %Res
+}
+
+; The half values are loaded and stored instead.
+define void @fun1(ptr %Op0, ptr %Op1, ptr %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: .cfi_offset %r12, -64
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: llgh %r12, 0(%r2)
+; CHECK-NEXT: llgh %r2, 0(%r3)
+; CHECK-NEXT: lgr %r13, %r4
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: lgr %r2, %r12
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: aebr %f0, %f8
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
+; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: sth %r2, 0(%r13)
+; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load half, ptr %Op0, align 2
+ %1 = load half, ptr %Op1, align 2
+ %add = fadd half %0, %1
+ store half %add, ptr %Dst, align 2
+ ret void
+}
+
+; Test a chain of half operations which should have each operation surrounded
+; by conversions to/from fp32 for proper emulation.
+define half @fun2(half %Op0, half %Op1, half %Op2) {
+; CHECK-LABEL: fun2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: .cfi_offset %r12, -64
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: vlgvf %r0, %v2, 0
+; CHECK-NEXT: llghr %r2, %r0
+; CHECK-NEXT: vlgvf %r13, %v4, 0
+; CHECK-NEXT: vlgvf %r12, %v0, 0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: llghr %r2, %r12
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: aebr %f0, %f8
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
+; CHECK-NEXT: llghr %r2, %r2
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: llghr %r2, %r13
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: wfasb %f0, %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
+; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: vlvgf %v0, %r2, 0
+; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ %A0 = fadd half %Op0, %Op1
+ %Res = fadd half %A0, %Op2
+ ret half %Res
+}
+
+; Store an incoming half argument and return a loaded one.
+define half @fun3(half %Op0, ptr %Dst, ptr %Src) {
+; CHECK-LABEL: fun3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vlgvf %r0, %v0, 0
+; CHECK-NEXT: sth %r0, 0(%r2)
+; CHECK-NEXT: lh %r0, 0(%r3)
+; CHECK-NEXT: vlvgf %v0, %r0, 0
+; CHECK-NEXT: br %r14
+entry:
+ store half %Op0, ptr %Dst
+
+ %Res = load half, ptr %Src
+ ret half %Res
+}
+
+; Call a function with half argument and return values.
+declare half @foo(half)
+define void @fun4(ptr %Src, ptr %Dst) {
+; CHECK-LABEL: fun4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lh %r0, 0(%r2)
+; CHECK-NEXT: vlvgf %v0, %r0, 0
+; CHECK-NEXT: lgr %r13, %r3
+; CHECK-NEXT: brasl %r14, foo at PLT
+; CHECK-NEXT: vlgvf %r0, %v0, 0
+; CHECK-NEXT: sth %r0, 0(%r13)
+; CHECK-NEXT: lmg %r13, %r15, 264(%r15)
+; CHECK-NEXT: br %r14
+entry:
+ %arg = load half, ptr %Src
+ %Res = call half @foo(half %arg)
+ store half %Res, ptr %Dst
+ ret void
+}
+
+; Receive stack argument.
+define half @bar(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) {
+; CHECK-LABEL: bar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset %f8, -168
+; CHECK-NEXT: vlgvf %r0, %v6, 0
+; CHECK-NEXT: llgh %r13, 334(%r15)
+; CHECK-NEXT: llghr %r2, %r0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: lgr %r2, %r13
+; CHECK-NEXT: ldr %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
+; CHECK-NEXT: wfasb %f0, %f8, %f0
+; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
+; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
+; CHECK-NEXT: vlvgf %v0, %r2, 0
+; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %A0 = fadd half %Arg3, %Arg4
+ ret half %A0
+}
+
+; Pass stack argument.
+define void @fun5() {
+; CHECK-LABEL: fun5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -168
+; CHECK-NEXT: .cfi_def_cfa_offset 328
+; CHECK-NEXT: lzer %f0
+; CHECK-NEXT: ldr %f2, %f0
+; CHECK-NEXT: ldr %f4, %f0
+; CHECK-NEXT: ldr %f6, %f0
+; CHECK-NEXT: mvhi 164(%r15), 0
+; CHECK-NEXT: brasl %r14, bar at PLT
+; CHECK-NEXT: lmg %r14, %r15, 280(%r15)
+; CHECK-NEXT: br %r14
+ call void @bar (half 0.0, half 0.0, half 0.0, half 0.0, half 0.0)
+ ret void
+}
>From f95163d40d3c11be13f34101f9bcc0df3b7e09e3 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 28 Oct 2024 15:44:42 +0100
Subject: [PATCH 2/4] Experiment with soft-promotion in FP regs (not working).
---
llvm/lib/Target/SystemZ/SystemZCallingConv.td | 6 ++---
.../Target/SystemZ/SystemZISelLowering.cpp | 23 ++++++++++++-------
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 5 ----
llvm/test/CodeGen/SystemZ/fp-half.ll | 1 -
4 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 4f736e1c1718741..6be1b513d10851a 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,8 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
// other floating-point argument registers available for code that
// doesn't care about the ABI. All floating-point argument registers
// are call-clobbered, so we can use all of them here.
- CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
- CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f16, f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// Similarly for vectors, with V24 being the ABI-compliant choice.
@@ -116,8 +115,7 @@ def CC_SystemZ_ELF : CallingConv<[
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
// The first 4 float and double arguments are passed in even registers F0-F6.
- CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
- CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f16, f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 36bf18d7dcc1a34..a57cb19d84b16f0 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -799,7 +799,7 @@ MVT SystemZTargetLowering::getRegisterTypeForCallingConv(
if (VT.isVector() && VT.getSizeInBits() == 128 &&
VT.getVectorNumElements() == 1)
return MVT::v16i8;
- // Keep f16 so that they can be recognized and handled.
+ // Keep f16 so it can be recognized and handled.
if (VT == MVT::f16)
return MVT::f16;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
@@ -1625,10 +1625,13 @@ bool SystemZTargetLowering::splitValueIntoRegisterParts(
// Convert f16 to f32 (Out-arg).
if (PartVT == MVT::f16) {
- assert(NumParts == 1 && "");
- SDValue I16Val = DAG.getBitcast(MVT::i16, Val);
- SDValue I32Val = DAG.getAnyExtOrTrunc(I16Val, DL, MVT::i32);
- Parts[0] = DAG.getBitcast(MVT::f32, I32Val);
+ assert(NumParts == 1 && "f16 only needs one register.");
+ SDValue F16Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8f16,
+ DAG.getUNDEF(MVT::v8f16), Val,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue F32Vec = DAG.getBitcast(MVT::v4f32, F16Vec);
+ Parts[0] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+ F32Vec, DAG.getVectorIdxConstant(0, DL));
return true;
}
@@ -1654,9 +1657,13 @@ static SDValue convertF32ToF16(SDValue F32Val, SelectionDAG &DAG,
const SDLoc &DL) {
assert(F32Val->getOpcode() == ISD::CopyFromReg &&
"Only expecting to handle f16 with CopyFromReg here.");
- SDValue I32Val = DAG.getBitcast(MVT::i32, F32Val);
- SDValue I16Val = DAG.getAnyExtOrTrunc(I32Val, DL, MVT::i16);
- return DAG.getBitcast(MVT::f16, I16Val);
+
+ SDValue F32Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
+ DAG.getUNDEF(MVT::v4f32), F32Val,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue F16Vec = DAG.getBitcast(MVT::v8f16, F32Vec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16,
+ F16Vec, DAG.getVectorIdxConstant(0, DL));
}
SDValue SystemZTargetLowering::LowerFormalArguments(
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index ecffa4deea0f5a6..f67d4a27cc03348 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -471,11 +471,6 @@ class SystemZTargetLowering : public TargetLowering {
}
bool softPromoteHalfType() const override { return true; }
bool useFPRegsForHalfType() const override { return true; }
- bool shouldKeepZExtForFP16Conv() const override {
- // Keep the zero extension from 16 bits if present (as with incoming
- // arguments).
- return true;
- }
bool hasInlineStackProbe(const MachineFunction &MF) const override;
AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
index 51ac42b012388ed..6458aae7055e412 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -119,7 +119,6 @@ define half @fun3(half %Op0, ptr %Dst, ptr %Src) {
; CHECK-NEXT: br %r14
entry:
store half %Op0, ptr %Dst
-
%Res = load half, ptr %Src
ret half %Res
}
>From d58853c0dd5718c8f8a33920d2f75b18d248d872 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 29 Oct 2024 10:57:54 +0100
Subject: [PATCH 3/4] Try to make f16 legal instead
---
clang/lib/Basic/Targets/SystemZ.h | 5 +
clang/lib/Sema/SemaExpr.cpp | 2 +-
compiler-rt/test/builtins/CMakeLists.txt | 2 +-
llvm/lib/IR/RuntimeLibcalls.cpp | 2 +
.../SystemZ/AsmParser/SystemZAsmParser.cpp | 7 +
.../MCTargetDesc/SystemZMCTargetDesc.cpp | 7 +
.../MCTargetDesc/SystemZMCTargetDesc.h | 1 +
llvm/lib/Target/SystemZ/SystemZCallingConv.td | 6 +-
.../Target/SystemZ/SystemZISelLowering.cpp | 227 +++--
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 16 +-
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 7 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 2 +
.../lib/Target/SystemZ/SystemZRegisterInfo.td | 17 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 4 +-
.../lib/Target/SystemZ/SystemZScheduleZ196.td | 4 +-
.../Target/SystemZ/SystemZScheduleZEC12.td | 4 +-
llvm/test/CodeGen/SystemZ/fp-half.ll | 789 ++++++++++++++----
llvm/test/CodeGen/SystemZ/fp-round-03.ll | 12 +
llvm/test/CodeGen/SystemZ/fp-sincos-01.ll | 18 +
llvm/test/CodeGen/SystemZ/twoaddr-kill.mir | 8 +-
23 files changed, 906 insertions(+), 246 deletions(-)
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index f665a0b72a8e384..e23f9960948fd42 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -93,6 +93,11 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128;
// True if the backend supports operations on the half LLVM IR type.
+ // By setting this to false, conversions will happen for _Float16 around
+ // a statement by default with operations done in float. However, if
+ // -ffloat16-excess-precision=none is given, no conversions will be made
+ // and instead the backend will promote each half operation to float
+ // individually.
HasLegalHalfType = false;
// Allow half arguments and return values.
HalfArgsAndReturns = true;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 9c8d821a775db84..7f3cff1054aeed5 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -16547,7 +16547,7 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,
PromoteType = QualType();
}
}
- if (TInfo->getType()->isFloat16Type() || TInfo->getType()->isFloat32Type())
+ if (TInfo->getType()->isSpecificBuiltinType(BuiltinType::Float))
PromoteType = Context.DoubleTy;
if (!PromoteType.isNull())
DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,
diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt
index 8fdcec6029a2a1a..63f4c94605c9075 100644
--- a/compiler-rt/test/builtins/CMakeLists.txt
+++ b/compiler-rt/test/builtins/CMakeLists.txt
@@ -56,7 +56,7 @@ foreach(arch ${BUILTIN_TEST_ARCH})
string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}")
endif()
else()
- if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64" AND COMPILER_RT_HAS_${arch}_FLOAT16)
+ if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64|s390x" AND COMPILER_RT_HAS_${arch}_FLOAT16)
list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_HAS_FLOAT16)
string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}")
endif()
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 06167559a77697d..e107c439c21a125 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -327,6 +327,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
}
if (TT.isX86()) {
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index b8469a6ba70eafd..5988fac8041b826 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -60,6 +60,7 @@ enum RegisterKind {
GRH32Reg,
GR64Reg,
GR128Reg,
+ FP16Reg,
FP32Reg,
FP64Reg,
FP128Reg,
@@ -356,6 +357,7 @@ class SystemZOperand : public MCParsedAsmOperand {
bool isADDR32() const { return isReg(GR32Reg); }
bool isADDR64() const { return isReg(GR64Reg); }
bool isADDR128() const { return false; }
+ bool isFP16() const { return isReg(FP16Reg); }
bool isFP32() const { return isReg(FP32Reg); }
bool isFP64() const { return isReg(FP64Reg); }
bool isFP128() const { return isReg(FP128Reg); }
@@ -534,6 +536,9 @@ class SystemZAsmParser : public MCTargetAsmParser {
ParseStatus parseADDR128(OperandVector &Operands) {
llvm_unreachable("Shouldn't be used as an operand");
}
+ ParseStatus parseFP16(OperandVector &Operands) {
+ return parseRegister(Operands, FP16Reg);
+ }
ParseStatus parseFP32(OperandVector &Operands) {
return parseRegister(Operands, FP32Reg);
}
@@ -829,6 +834,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands,
case GR128Reg:
Group = RegGR;
break;
+ case FP16Reg:
case FP32Reg:
case FP64Reg:
case FP128Reg:
@@ -882,6 +888,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands,
case GRH32Reg: Regs = SystemZMC::GRH32Regs; break;
case GR64Reg: Regs = SystemZMC::GR64Regs; break;
case GR128Reg: Regs = SystemZMC::GR128Regs; break;
+ case FP16Reg: Regs = SystemZMC::FP16Regs; break;
case FP32Reg: Regs = SystemZMC::FP32Regs; break;
case FP64Reg: Regs = SystemZMC::FP64Regs; break;
case FP128Reg: Regs = SystemZMC::FP128Regs; break;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 333221c46ebb8bf..7b8ed4b679c01c3 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -61,6 +61,13 @@ const unsigned SystemZMC::GR128Regs[16] = {
SystemZ::R12Q, 0, SystemZ::R14Q, 0
};
+const unsigned SystemZMC::FP16Regs[16] = {
+ SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H,
+ SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H,
+ SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H,
+ SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H
+};
+
const unsigned SystemZMC::FP32Regs[16] = {
SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 39c1836a137005c..806463707f58e65 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -43,6 +43,7 @@ extern const unsigned GR32Regs[16];
extern const unsigned GRH32Regs[16];
extern const unsigned GR64Regs[16];
extern const unsigned GR128Regs[16];
+extern const unsigned FP16Regs[16];
extern const unsigned FP32Regs[16];
extern const unsigned FP64Regs[16];
extern const unsigned FP128Regs[16];
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 6be1b513d10851a..0ad872bcb63a749 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,7 +50,8 @@ def RetCC_SystemZ_ELF : CallingConv<[
// other floating-point argument registers available for code that
// doesn't care about the ABI. All floating-point argument registers
// are call-clobbered, so we can use all of them here.
- CCIfType<[f16, f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>,
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// Similarly for vectors, with V24 being the ABI-compliant choice.
@@ -115,7 +116,8 @@ def CC_SystemZ_ELF : CallingConv<[
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
// The first 4 float and double arguments are passed in even registers F0-F6.
- CCIfType<[f16, f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>,
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index a57cb19d84b16f0..ecef68a90957226 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -102,6 +102,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
if (!useSoftFloat()) {
+ addRegisterClass(MVT::f16, &SystemZ::FP16BitRegClass);
if (Subtarget.hasVector()) {
addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
@@ -513,11 +514,35 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
}
// Handle floating-point types.
+ // Promote all f16 operations to float, with some exceptions below.
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, MVT::f16, Promote);
+ setOperationAction(ISD::ConstantFP, MVT::f16, Expand);
+ for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ setTruncStoreAction(VT, MVT::f16, Expand);
+ }
+ setOperationAction(ISD::LOAD, MVT::f16, Custom);
+ setOperationAction(ISD::STORE, MVT::f16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
++I) {
MVT VT = MVT::SimpleValueType(I);
if (isTypeLegal(VT)) {
+ // No special instructions for these.
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ if (VT == MVT::f16)
+ continue;
+
// We can use FI for FRINT.
setOperationAction(ISD::FRINT, VT, Legal);
@@ -530,13 +555,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FROUND, VT, Legal);
}
- // No special instructions for these.
- setOperationAction(ISD::FSIN, VT, Expand);
- setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FPOW, VT, Expand);
-
// Special treatment.
setOperationAction(ISD::IS_FPCLASS, VT, Custom);
@@ -711,13 +729,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::f32, Custom);
}
- // Expand FP16 <=> FP32 conversions to libcalls and handle FP16 loads and
- // stores in GPRs.
- setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
-
// VASTART and VACOPY need to deal with the SystemZ-specific varargs
// structure, but VAEND is a no-op.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -773,6 +784,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Default to having -disable-strictnode-mutation on
IsStrictFPEnabled = true;
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+
if (Subtarget.isTargetzOS()) {
struct RTLibCallMapping {
RTLIB::Libcall Code;
@@ -791,20 +805,6 @@ bool SystemZTargetLowering::useSoftFloat() const {
return Subtarget.hasSoftFloat();
}
-MVT SystemZTargetLowering::getRegisterTypeForCallingConv(
- LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const {
- // 128-bit single-element vector types are passed like other vectors,
- // not like their element type.
- if (VT.isVector() && VT.getSizeInBits() == 128 &&
- VT.getVectorNumElements() == 1)
- return MVT::v16i8;
- // Keep f16 so it can be recognized and handled.
- if (VT == MVT::f16)
- return MVT::f16;
- return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
-}
-
EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &, EVT VT) const {
if (!VT.isVector())
@@ -954,6 +954,10 @@ SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) {
bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
+ // TODO: Loading all f16 constants from ConstantPool for now.
+ if (&Imm.getSemantics() == &APFloat::IEEEhalf())
+ return false;
+
// We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
if (Imm.isZero() || Imm.isNegZero())
return true;
@@ -1623,18 +1627,6 @@ bool SystemZTargetLowering::splitValueIntoRegisterParts(
return true;
}
- // Convert f16 to f32 (Out-arg).
- if (PartVT == MVT::f16) {
- assert(NumParts == 1 && "f16 only needs one register.");
- SDValue F16Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8f16,
- DAG.getUNDEF(MVT::v8f16), Val,
- DAG.getVectorIdxConstant(0, DL));
- SDValue F32Vec = DAG.getBitcast(MVT::v4f32, F16Vec);
- Parts[0] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
- F32Vec, DAG.getVectorIdxConstant(0, DL));
- return true;
- }
-
return false;
}
@@ -1650,22 +1642,6 @@ SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
return SDValue();
}
-// F32Val holds a f16 value in f32, return it as an f16 (In-arg). The
-// CopyFromReg was made into an f32 as required as FP32 registers are used
-// for arguments, now convert it to f16.
-static SDValue convertF32ToF16(SDValue F32Val, SelectionDAG &DAG,
- const SDLoc &DL) {
- assert(F32Val->getOpcode() == ISD::CopyFromReg &&
- "Only expecting to handle f16 with CopyFromReg here.");
-
- SDValue F32Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
- DAG.getUNDEF(MVT::v4f32), F32Val,
- DAG.getVectorIdxConstant(0, DL));
- SDValue F16Vec = DAG.getBitcast(MVT::v8f16, F32Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16,
- F16Vec, DAG.getVectorIdxConstant(0, DL));
-}
-
SDValue SystemZTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1706,6 +1682,9 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
RC = &SystemZ::GR64BitRegClass;
break;
case MVT::f16:
+ NumFixedFPRs += 1;
+ RC = &SystemZ::FP16BitRegClass;
+ break;
case MVT::f32:
NumFixedFPRs += 1;
RC = &SystemZ::FP32BitRegClass;
@@ -1730,11 +1709,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
Register VReg = MRI.createVirtualRegister(RC);
MRI.addLiveIn(VA.getLocReg(), VReg);
- // Special handling is needed for f16.
- MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
- ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, ArgVT);
- if (VA.getLocVT() == MVT::f16)
- ArgValue = convertF32ToF16(ArgValue, DAG, DL);
+ ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
@@ -2072,6 +2047,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getLocMemOffset();
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
Offset += 4;
+ else if (VA.getLocVT() == MVT::f16)
+ Offset += 6;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(Offset, DL));
@@ -2178,14 +2155,10 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Copy all of the result registers out of their specified physreg.
for (CCValAssign &VA : RetLocs) {
// Copy the value out, gluing the copy to the end of the call sequence.
- // Special handling is needed for f16.
- MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
- ArgVT, Glue);
+ VA.getLocVT(), Glue);
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
- if (VA.getLocVT() == MVT::f16)
- RetValue = convertF32ToF16(RetValue, DAG, DL);
// Convert the value of the return register into the value that's
// being returned.
@@ -6169,6 +6142,118 @@ static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) {
return Op;
}
+SDValue SystemZTargetLowering::LowerFP_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
+ MVT VT = Op.getSimpleValueType();
+ MVT SVT = In.getSimpleValueType();
+ if (SVT != MVT::f16)
+ return Op;
+
+ SDLoc DL(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+
+ // Need a libcall. XXX factor out (below)
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = In;
+ Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
+ Args.push_back(Entry);
+ SDValue Callee = DAG.getExternalSymbol(
+ getLibcallName(RTLIB::FPEXT_F16_F32), getPointerTy(DAG.getDataLayout()));
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ CallingConv::C, EVT(MVT::f32).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args));
+ SDValue Res;
+ std::tie(Res,Chain) = LowerCallTo(CLI);
+ if (IsStrict)
+ Res = DAG.getMergeValues({Res, Chain}, DL);
+
+ return DAG.getNode(ISD::FP_EXTEND, DL, VT, Res);
+}
+
+SDValue SystemZTargetLowering::LowerFP_ROUND(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
+ MVT VT = Op.getSimpleValueType();
+ MVT SVT = In.getSimpleValueType();
+ if (VT != MVT::f16)
+ return SDValue(); // XXX?
+
+ SDLoc DL(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+
+ if (SVT != MVT::f32) {
+ SDValue Rnd = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
+ In = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Rnd);
+ }
+
+ // We need a libcall.
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = In;
+ Entry.Ty = EVT(MVT::f32).getTypeForEVT(*DAG.getContext());
+ Args.push_back(Entry);
+ SDValue Callee = DAG.getExternalSymbol(
+ getLibcallName(RTLIB::FPROUND_F32_F16), getPointerTy(DAG.getDataLayout()));
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ CallingConv::C, EVT(MVT::f16).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args));
+ SDValue Res;
+ std::tie(Res, Chain) = LowerCallTo(CLI);
+ if (IsStrict)
+ Res = DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+}
+
+SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT RegVT = Op.getSimpleValueType();
+ if (RegVT != MVT::f16)
+ return SDValue();
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ SDLoc DL(Ld);
+ assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending f16 load");
+ // Load as integer, shift and insert into upper 2 bytes of the FP register.
+ // TODO: Use VLEH if available.
+ SDValue NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getPointerInfo(),
+ MVT::i16, Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i32, NewLd,
+ DAG.getConstant(16, DL, MVT::i32));
+ SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Shft);
+ SDValue F16Val = DAG.getTargetExtractSubreg(SystemZ::subreg_h16,
+ DL, MVT::f16, BCast);
+ return DAG.getMergeValues({F16Val, NewLd.getValue(1)}, DL);
+}
+
+SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
+ SelectionDAG &DAG) const {
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+ SDLoc DL(St);
+ SDValue StoredVal = St->getValue();
+ MVT StoreVT = StoredVal.getSimpleValueType();
+ if (StoreVT != MVT::f16)
+ return SDValue();
+ // Move into a GPR, shift and store the 2 bytes.
+ // TODO: Use VSTEH if available.
+ SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f32);
+ SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL,
+ MVT::f32, SDValue(U32, 0), StoredVal);
+ SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, In32);
+ SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i32, BCast,
+ DAG.getConstant(16, DL, MVT::i32));
+ return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(),
+ MVT::i16, St->getMemOperand());
+}
+
SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -6346,6 +6431,16 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerAddrSpaceCast(Op, DAG);
case ISD::ROTL:
return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
+ case ISD::FP_EXTEND:
+//case ISD::STRICT_FP_EXTEND:
+ return LowerFP_EXTEND(Op, DAG);
+ case ISD::FP_ROUND:
+//case ISD::STRICT_FP_ROUND:
+ return LowerFP_ROUND(Op, DAG);
+ case ISD::LOAD:
+ return lowerLoadF16(Op, DAG);
+ case ISD::STORE:
+ return lowerStoreF16(Op, DAG);
case ISD::IS_FPCLASS:
return lowerIS_FPCLASS(Op, DAG);
case ISD::GET_ROUNDING:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index f67d4a27cc03348..36c5a78dd8027f5 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -447,7 +447,14 @@ class SystemZTargetLowering : public TargetLowering {
return TargetLowering::getNumRegisters(Context, VT);
}
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const override;
+ EVT VT) const override {
+ // 128-bit single-element vector types are passed like other vectors,
+ // not like their element type.
+ if (VT.isVector() && VT.getSizeInBits() == 128 &&
+ VT.getVectorNumElements() == 1)
+ return MVT::v16i8;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+ }
bool isCheapToSpeculateCtlz(Type *) const override { return true; }
bool isCheapToSpeculateCttz(Type *) const override { return true; }
bool preferZeroCompareBranch() const override { return true; }
@@ -469,8 +476,6 @@ class SystemZTargetLowering : public TargetLowering {
// LD, and having the full constant in memory enables reg/mem opcodes.
return VT != MVT::f64;
}
- bool softPromoteHalfType() const override { return true; }
- bool useFPRegsForHalfType() const override { return true; }
bool hasInlineStackProbe(const MachineFunction &MF) const override;
AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
@@ -714,6 +719,11 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerLoadF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerStoreF16(SDValue Op, SelectionDAG &DAG) const;
+
SDValue lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index aad04a2b4159cbf..aa9d429e271ced0 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -47,8 +47,11 @@ def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>;
def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
// For z13 we prefer LDR over LER to avoid partial register dependencies.
-let isCodeGenOnly = 1 in
- def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+let isCodeGenOnly = 1 in {
+ def LER16 : UnaryRR <"ler", 0x38, null_frag, FP16, FP16>;
+ def LDR16 : UnaryRR<"ldr", 0x28, null_frag, FP16, FP16>;
+ def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+}
// Moves between two floating-point registers that also set the condition
// codes. Note that these instructions will turn SNaNs into QNaNs and should
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 91db858f5cdaf61..f37826f8c1f5c94 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -967,6 +967,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode;
if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LGR;
+ else if (SystemZ::FP16BitRegClass.contains(DestReg, SrcReg))
+ Opcode = STI.hasVector() ? SystemZ::LDR16 : SystemZ::LER16;
else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
// For z13 we prefer LDR over LER to avoid partial register dependencies.
Opcode = STI.hasVector() ? SystemZ::LDR32 : SystemZ::LER;
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index 8f9bb56f2eb3bbb..cb5147aca86af65 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -20,6 +20,7 @@ class SystemZRegWithSubregs<string n, list<Register> subregs>
}
let Namespace = "SystemZ" in {
+def subreg_h16 : SubRegIndex<16, 16>;
def subreg_l32 : SubRegIndex<32, 0>; // Also acts as subreg_hl32.
def subreg_h32 : SubRegIndex<32, 32>; // Also acts as subreg_hh32.
def subreg_l64 : SubRegIndex<64, 0>;
@@ -201,9 +202,16 @@ def F27Dwarf : DwarfMapping<81>;
def F29Dwarf : DwarfMapping<82>;
def F31Dwarf : DwarfMapping<83>;
+// Upper 16 bits of one of the floating-point registers
+class FPR16<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+
// Upper 32 bits of one of the floating-point registers
-class FPR32<bits<16> num, string n> : SystemZReg<n> {
+class FPR32<bits<16> num, string n, FPR16 high>
+ : SystemZRegWithSubregs<n, [high]> {
let HWEncoding = num;
+ let SubRegIndices = [subreg_h16];
}
// One of the floating-point registers.
@@ -223,12 +231,14 @@ class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
// Floating-point registers. Registers 16-31 require the vector facility.
foreach I = 0-15 in {
- def F#I#S : FPR32<I, "f"#I>;
+ def F#I#H : FPR16<I, "f"#I>;
+ def F#I#S : FPR32<I, "f"#I, !cast<FPR16>("F"#I#"H")>;
def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
foreach I = 16-31 in {
- def F#I#S : FPR32<I, "v"#I>;
+ def F#I#H : FPR16<I, "v"#I>;
+ def F#I#S : FPR32<I, "v"#I, !cast<FPR16>("F"#I#"H")>;
def F#I#D : FPR64<I, "v"#I, !cast<FPR32>("F"#I#"S")>,
DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
@@ -240,6 +250,7 @@ foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
// There's no store-multiple instruction for FPRs, so we're not fussy
// about the order in which call-saved registers are allocated.
+defm FP16 : SystemZRegClass<"FP16", [f16], 16, (sequence "F%uH", 0, 15)>;
defm FP32 : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>;
defm FP64 : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>;
defm FP128 : SystemZRegClass<"FP128", [f128], 128,
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index d0fec02777875ac..c36bd8d1dd30a76 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -777,8 +777,8 @@ def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index a6d89ce9443c5aa..ba08bd8077e12d7 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -797,8 +797,8 @@ def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 455354e283ad8ec..71474b674dad802 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -815,8 +815,8 @@ def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index 92abf0ba4022cc2..c324b2f37641bc7 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -816,8 +816,8 @@ def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index 99d0d674bbbb2fd..2d09d489c8228e2 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -709,8 +709,8 @@ def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LXR$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 5b334da2bac3429..5af031ce73baade 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -747,8 +747,8 @@ def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>;
def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>;
// Load
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>;
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
index 6458aae7055e412..cbc680f90d160f9 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -1,107 +1,439 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
;
; Tests for 16-bit floating point (half).
+
; Incoming half arguments added together and returned.
define half @fun0(half %Op0, half %Op1) {
-; CHECK-LABEL: fun0:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset %f8, -168
-; CHECK-NEXT: vlgvf %r0, %v2, 0
-; CHECK-NEXT: llghr %r2, %r0
-; CHECK-NEXT: vlgvf %r13, %v0, 0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: llghr %r2, %r13
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: aebr %f0, %f8
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
-; CHECK-NEXT: br %r14
+; NOVEC-LABEL: fun0:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
entry:
%Res = fadd half %Op0, %Op1
ret half %Res
}
+define half @fun1(half %Op0, half %Op1) {
+; NOVEC-LABEL: fun1:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f0, %f0
+; NOVEC-NEXT: adbr %f0, %f9
+; NOVEC-NEXT: ledbr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: ldr %f8, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: wfadb %f0, %f9, %f0
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E0 = fpext half %Op0 to double
+ %E1 = fpext half %Op1 to double
+ %Add = fadd double %E0, %E1
+ %Res = fptrunc double %Add to half
+ ret half %Res
+}
+
+define half @fun2(half %Op0, half %Op1) {
+; NOVEC-LABEL: fun2:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f11, -184
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: lxebr %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: lxebr %f0, %f0
+; NOVEC-NEXT: axbr %f0, %f9
+; NOVEC-NEXT: lexbr %f0, %f0
+; NOVEC-NEXT: # kill: def $f0s killed $f0s killed $f0q
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: ldr %f8, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: wfaxb %v0, %v1, %v0
+; VECTOR-NEXT: wflrx %f0, %v0, 0, 3
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E0 = fpext half %Op0 to fp128
+ %E1 = fpext half %Op1 to fp128
+ %Add = fadd fp128 %E0, %E1
+ %Res = fptrunc fp128 %Add to half
+ ret half %Res
+}
+
; The half values are loaded and stored instead.
-define void @fun1(ptr %Op0, ptr %Op1, ptr %Dst) {
-; CHECK-LABEL: fun1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
-; CHECK-NEXT: .cfi_offset %r12, -64
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset %f8, -168
-; CHECK-NEXT: llgh %r12, 0(%r2)
-; CHECK-NEXT: llgh %r2, 0(%r3)
-; CHECK-NEXT: lgr %r13, %r4
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: lgr %r2, %r12
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: aebr %f0, %f8
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
-; CHECK-NEXT: sth %r2, 0(%r13)
-; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
-; CHECK-NEXT: br %r14
+define void @fun3(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun3:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
entry:
- %0 = load half, ptr %Op0, align 2
- %1 = load half, ptr %Op1, align 2
- %add = fadd half %0, %1
- store half %add, ptr %Dst, align 2
+ %Op0 = load half, ptr %Src, align 2
+ %Add = fadd half %Op0, %Op0
+ store half %Add, ptr %Dst, align 2
+ ret void
+}
+
+define void @fun4(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun4:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f0, %f0
+; NOVEC-NEXT: adbr %f0, %f0
+; NOVEC-NEXT: ledbr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun4:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: adbr %f0, %f0
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Op0 = load half, ptr %Src, align 2
+ %E0 = fpext half %Op0 to double
+ %Add = fadd double %E0, %E0
+ %Res = fptrunc double %Add to half
+ store half %Res, ptr %Dst, align 2
+ ret void
+}
+
+define void @fun5(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun5:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: lxebr %f0, %f0
+; NOVEC-NEXT: axbr %f0, %f0
+; NOVEC-NEXT: lexbr %f0, %f0
+; NOVEC-NEXT: # kill: def $f0s killed $f0s killed $f0q
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun5:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: wflld %v0, %f0
+; VECTOR-NEXT: wfaxb %v0, %v0, %v0
+; VECTOR-NEXT: wflrx %f0, %v0, 0, 3
+; VECTOR-NEXT: ledbra %f0, 0, %f0, 0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Op0 = load half, ptr %Src, align 2
+ %E0 = fpext half %Op0 to fp128
+ %Add = fadd fp128 %E0, %E0
+ %Res = fptrunc fp128 %Add to half
+ store half %Res, ptr %Dst, align 2
ret void
}
; Test a chain of half operations which should have each operation surrounded
; by conversions to/from fp32 for proper emulation.
-define half @fun2(half %Op0, half %Op1, half %Op2) {
-; CHECK-LABEL: fun2:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
-; CHECK-NEXT: .cfi_offset %r12, -64
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset %f8, -168
-; CHECK-NEXT: vlgvf %r0, %v2, 0
-; CHECK-NEXT: llghr %r2, %r0
-; CHECK-NEXT: vlgvf %r13, %v4, 0
-; CHECK-NEXT: vlgvf %r12, %v0, 0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: llghr %r2, %r12
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: aebr %f0, %f8
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: llghr %r2, %r2
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: llghr %r2, %r13
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: wfasb %f0, %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
-; CHECK-NEXT: br %r14
+define half @fun6(half %Op0, half %Op1, half %Op2) {
+; NOVEC-LABEL: fun6:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f2
+; NOVEC-NEXT: ler %f8, %f4
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun6:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: ldr %f8, %f4
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: wfasb %f0, %f9, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
entry:
%A0 = fadd half %Op0, %Op1
%Res = fadd half %A0, %Op2
@@ -109,14 +441,32 @@ entry:
}
; Store an incoming half argument and return a loaded one.
-define half @fun3(half %Op0, ptr %Dst, ptr %Src) {
-; CHECK-LABEL: fun3:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vlgvf %r0, %v0, 0
-; CHECK-NEXT: sth %r0, 0(%r2)
-; CHECK-NEXT: lh %r0, 0(%r3)
-; CHECK-NEXT: vlvgf %v0, %r0, 0
-; CHECK-NEXT: br %r14
+define half @fun7(half %Op0, ptr %Dst, ptr %Src) {
+; NOVEC-LABEL: fun7:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r2)
+; NOVEC-NEXT: lh %r0, 0(%r3)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun7:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r2)
+; VECTOR-NEXT: lh %r0, 0(%r3)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: br %r14
entry:
store half %Op0, ptr %Dst
%Res = load half, ptr %Src
@@ -125,23 +475,50 @@ entry:
; Call a function with half argument and return values.
declare half @foo(half)
-define void @fun4(ptr %Src, ptr %Dst) {
-; CHECK-LABEL: fun4:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -160
-; CHECK-NEXT: .cfi_def_cfa_offset 320
-; CHECK-NEXT: lh %r0, 0(%r2)
-; CHECK-NEXT: vlvgf %v0, %r0, 0
-; CHECK-NEXT: lgr %r13, %r3
-; CHECK-NEXT: brasl %r14, foo at PLT
-; CHECK-NEXT: vlgvf %r0, %v0, 0
-; CHECK-NEXT: sth %r0, 0(%r13)
-; CHECK-NEXT: lmg %r13, %r15, 264(%r15)
-; CHECK-NEXT: br %r14
+define void @fun8(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun8:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, foo at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 264(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun8:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, foo at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 264(%r15)
+; VECTOR-NEXT: br %r14
entry:
%arg = load half, ptr %Src
%Res = call half @foo(half %arg)
@@ -150,51 +527,159 @@ entry:
}
; Receive stack argument.
-define half @bar(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) {
-; CHECK-LABEL: bar:
-; CHECK: # %bb.0:
-; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
-; CHECK-NEXT: .cfi_offset %r13, -56
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset %f8, -168
-; CHECK-NEXT: vlgvf %r0, %v6, 0
-; CHECK-NEXT: llgh %r13, 334(%r15)
-; CHECK-NEXT: llghr %r2, %r0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: lgr %r2, %r13
-; CHECK-NEXT: ldr %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_h2f_ieee at PLT
-; CHECK-NEXT: wfasb %f0, %f8, %f0
-; CHECK-NEXT: brasl %r14, __gnu_f2h_ieee at PLT
-; CHECK-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
-; CHECK-NEXT: br %r14
+define half @fun9(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) {
+; NOVEC-LABEL: fun9:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -176
+; NOVEC-NEXT: .cfi_def_cfa_offset 336
+; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: lh %r0, 342(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ler %f8, %f6
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f9
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 288(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun9:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -176
+; VECTOR-NEXT: .cfi_def_cfa_offset 336
+; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: lh %r0, 342(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: ldr %f8, %f6
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 288(%r15)
+; VECTOR-NEXT: br %r14
%A0 = fadd half %Arg3, %Arg4
ret half %A0
}
; Pass stack argument.
-define void @fun5() {
-; CHECK-LABEL: fun5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
-; CHECK-NEXT: .cfi_offset %r14, -48
-; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: .cfi_def_cfa_offset 328
-; CHECK-NEXT: lzer %f0
-; CHECK-NEXT: ldr %f2, %f0
-; CHECK-NEXT: ldr %f4, %f0
-; CHECK-NEXT: ldr %f6, %f0
-; CHECK-NEXT: mvhi 164(%r15), 0
-; CHECK-NEXT: brasl %r14, bar at PLT
-; CHECK-NEXT: lmg %r14, %r15, 280(%r15)
-; CHECK-NEXT: br %r14
- call void @bar (half 0.0, half 0.0, half 0.0, half 0.0, half 0.0)
+define void @fun10(half %Arg0) {
+; NOVEC-LABEL: fun10:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -168
+; NOVEC-NEXT: .cfi_def_cfa_offset 328
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: ler %f2, %f0
+; NOVEC-NEXT: ler %f4, %f0
+; NOVEC-NEXT: ler %f6, %f0
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, fun9 at PLT
+; NOVEC-NEXT: lmg %r14, %r15, 280(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun10:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -168
+; VECTOR-NEXT: .cfi_def_cfa_offset 328
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: ldr %f2, %f0
+; VECTOR-NEXT: ldr %f4, %f0
+; VECTOR-NEXT: ldr %f6, %f0
+; VECTOR-NEXT: sth %r0, 166(%r15)
+; VECTOR-NEXT: brasl %r14, fun9 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 280(%r15)
+; VECTOR-NEXT: br %r14
+ call void @fun9(half %Arg0, half %Arg0, half %Arg0, half %Arg0, half %Arg0)
+ ret void
+}
+
+; Test loading some immediates from the Constant Pool.
+declare void @foo2(half, half, half)
+define void @fun11() {
+; NOVEC-LABEL: fun11:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -160
+; NOVEC-NEXT: .cfi_def_cfa_offset 320
+; NOVEC-NEXT: lhrl %r0, .LCPI11_0
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lhrl %r0, .LCPI11_1
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; NOVEC-NEXT: lhrl %r0, .LCPI11_2
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; NOVEC-NEXT: brasl %r14, foo2 at PLT
+; NOVEC-NEXT: lmg %r14, %r15, 272(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun11:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -160
+; VECTOR-NEXT: .cfi_def_cfa_offset 320
+; VECTOR-NEXT: lhrl %r0, .LCPI11_0
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: lhrl %r0, .LCPI11_1
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v2, %r0, 0
+; VECTOR-NEXT: lhrl %r0, .LCPI11_2
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v4, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: # kill: def $f2h killed $f2h killed $f2s
+; VECTOR-NEXT: # kill: def $f4h killed $f4h killed $f4s
+; VECTOR-NEXT: brasl %r14, foo2 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ call half @foo2(half 0.0, half 1.0, half 0.375)
ret void
}
diff --git a/llvm/test/CodeGen/SystemZ/fp-round-03.ll b/llvm/test/CodeGen/SystemZ/fp-round-03.ll
index d35cafc406ad774..837a95705b5f852 100644
--- a/llvm/test/CodeGen/SystemZ/fp-round-03.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-round-03.ll
@@ -2,6 +2,18 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+; Test that an f16 intrinsic can be lowered with promotion to float.
+declare half @llvm.rint.f16(half %f)
+define half @f0(half %f) {
+; CHECK-LABEL: f0:
+; CHECK: brasl %r14, __extendhfsf2 at PLT
+; CHECK: fiebra %f0, 0, %f0, 0
+; CHECK: brasl %r14, __truncsfhf2 at PLT
+; CHECK: br %r14
+ %res = call half @llvm.rint.f16(half %f)
+ ret half %res
+}
+
; Test rint for f32.
declare float @llvm.rint.f32(float %f)
define float @f1(float %f) {
diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
index 4a38d7afba2c9df..36b3ef67c9e75a1 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
@@ -3,6 +3,22 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT
; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT
+; Test f16 libcalls.
+define half @f0(half %x) {
+; CHECK-OPT-LABEL: f0:
+; CHECK-OPT-NOT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-OPT: brasl %r14, sinh at PLT
+; CHECK-OPT: brasl %r14, cosh at PLT
+; CHECK-OPT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-OPT: brasl %r14, __extendhfsf2 at PLT
+; CHECK-OPT: aebr %f0, %f8
+; CHECK-OPT: brasl %r14, __truncsfhf2 at PLT
+ %tmp1 = call half @sinh(half %x) readnone
+ %tmp2 = call half @cosh(half %x) readnone
+ %add = fadd half %tmp1, %tmp2
+ ret half %add
+}
+
define float @f1(float %x) {
; CHECK-OPT-LABEL: f1:
; CHECK-OPT: brasl %r14, sincosf at PLT
@@ -70,9 +86,11 @@ define fp128 @f3_errno(fp128 %x) {
ret fp128 %add
}
+declare half @sinh(half)
declare float @sinf(float)
declare double @sin(double)
declare fp128 @sinl(fp128)
+declare half @cosh(half)
declare float @cosf(float)
declare double @cos(double)
declare fp128 @cosl(fp128)
diff --git a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
index 7fc7bd3e347bb58..fb1448497c19043 100644
--- a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
+++ b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
@@ -18,19 +18,19 @@ body: |
; CHECK-NEXT: $r2l = COPY [[COPY]]
; CHECK-NEXT: $r3l = COPY killed [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:grh32bit = COPY killed [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
+ ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 458762 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
; CHECK-NEXT: [[COPY3:%[0-9]+]]:grh32bit = COPY killed [[COPY2]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:grh32bit = COPY [[COPY3]]
- ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 393225 /* reguse:GRH32Bit */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 458763 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 458761 /* reguse:GRH32Bit */, [[COPY3]]
; CHECK-NEXT: $r2l = COPY killed [[COPY4]]
; CHECK-NEXT: Return implicit killed $r2l
%0:gr32bit = COPY killed $r2l
%2:grh32bit = COPY %0
$r2l = COPY %0
$r3l = COPY killed %0
- INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
+ INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 458762 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
%4:grh32bit = COPY killed %1
- INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 393225 /* reguse:GRH32Bit */, %4
+ INLINEASM &"stepb $1, $2", 0 /* attdialect */, 458763 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 458761 /* reguse:GRH32Bit */, %4
$r2l = COPY killed %3
Return implicit killed $r2l
...
>From 26660a60d7dcd9acaf435b2d0d03d58780b44696 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 5 Nov 2024 11:16:30 -0600
Subject: [PATCH 4/4] Atomic loads/stores, spill/reload, tests for __fp16 and
half vectors.
---
clang/lib/Basic/Targets/SystemZ.h | 4 +-
clang/lib/CodeGen/Targets/SystemZ.cpp | 2 +-
.../{fexcess-precision.c => Float16.c} | 0
clang/test/CodeGen/SystemZ/fp16.c | 32 +
clang/test/CodeGen/SystemZ/systemz-abi.c | 43 +
.../Target/SystemZ/SystemZISelLowering.cpp | 60 +-
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 2 +
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 46 +
llvm/test/CodeGen/SystemZ/atomic-load-10.ll | 16 +
llvm/test/CodeGen/SystemZ/atomic-store-10.ll | 17 +
llvm/test/CodeGen/SystemZ/fp-half-vector.ll | 1050 +++++++++++++++++
llvm/test/CodeGen/SystemZ/fp-half.ll | 3 +-
llvm/test/CodeGen/SystemZ/fp-sincos-01.ll | 2 +-
llvm/test/CodeGen/SystemZ/spill-half.mir | 39 +
14 files changed, 1297 insertions(+), 19 deletions(-)
rename clang/test/CodeGen/SystemZ/{fexcess-precision.c => Float16.c} (100%)
create mode 100644 clang/test/CodeGen/SystemZ/fp16.c
create mode 100644 llvm/test/CodeGen/SystemZ/atomic-load-10.ll
create mode 100644 llvm/test/CodeGen/SystemZ/atomic-store-10.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector.ll
create mode 100644 llvm/test/CodeGen/SystemZ/spill-half.mir
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index e23f9960948fd42..ab9b0d0be4998be 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -94,12 +94,12 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
// True if the backend supports operations on the half LLVM IR type.
// By setting this to false, conversions will happen for _Float16 around
- // a statement by default with operations done in float. However, if
+ // a statement by default, with operations done in float. However, if
// -ffloat16-excess-precision=none is given, no conversions will be made
// and instead the backend will promote each half operation to float
// individually.
HasLegalHalfType = false;
- // Allow half arguments and return values.
+ // Allow half arguments and return values (__fp16).
HalfArgsAndReturns = true;
// Support _Float16.
HasFloat16 = true;
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index 5e313c999240af7..84af39bba188891 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,7 +185,7 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
switch (BT->getKind()) {
-// case BuiltinType::Half: // __fp16 Support __fp16??
+ case BuiltinType::Half: // __fp16
case BuiltinType::Float16: // _Float16
case BuiltinType::Float:
case BuiltinType::Double:
diff --git a/clang/test/CodeGen/SystemZ/fexcess-precision.c b/clang/test/CodeGen/SystemZ/Float16.c
similarity index 100%
rename from clang/test/CodeGen/SystemZ/fexcess-precision.c
rename to clang/test/CodeGen/SystemZ/Float16.c
diff --git a/clang/test/CodeGen/SystemZ/fp16.c b/clang/test/CodeGen/SystemZ/fp16.c
new file mode 100644
index 000000000000000..52683424f6cc1da
--- /dev/null
+++ b/clang/test/CodeGen/SystemZ/fp16.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu -emit-llvm -o - %s \
+// RUN: | FileCheck %s
+
+__fp16 f(__fp16 a, __fp16 b, __fp16 c, __fp16 d) {
+ return a * b + c * d;
+}
+
+// CHECK-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %a.addr = alloca half, align 2
+// CHECK-NEXT: %b.addr = alloca half, align 2
+// CHECK-NEXT: %c.addr = alloca half, align 2
+// CHECK-NEXT: %d.addr = alloca half, align 2
+// CHECK-NEXT: store half %a, ptr %a.addr, align 2
+// CHECK-NEXT: store half %b, ptr %b.addr, align 2
+// CHECK-NEXT: store half %c, ptr %c.addr, align 2
+// CHECK-NEXT: store half %d, ptr %d.addr, align 2
+// CHECK-NEXT: %0 = load half, ptr %a.addr, align 2
+// CHECK-NEXT: %conv = fpext half %0 to float
+// CHECK-NEXT: %1 = load half, ptr %b.addr, align 2
+// CHECK-NEXT: %conv1 = fpext half %1 to float
+// CHECK-NEXT: %mul = fmul float %conv, %conv1
+// CHECK-NEXT: %2 = load half, ptr %c.addr, align 2
+// CHECK-NEXT: %conv2 = fpext half %2 to float
+// CHECK-NEXT: %3 = load half, ptr %d.addr, align 2
+// CHECK-NEXT: %conv3 = fpext half %3 to float
+// CHECK-NEXT: %mul4 = fmul float %conv2, %conv3
+// CHECK-NEXT: %add = fadd float %mul, %mul4
+// CHECK-NEXT: %4 = fptrunc float %add to half
+// CHECK-NEXT: ret half %4
+// CHECK-NEXT: }
+
diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c
index 2287126bdeabece..07c914f1fa0019e 100644
--- a/clang/test/CodeGen/SystemZ/systemz-abi.c
+++ b/clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -45,6 +45,9 @@ long long pass_longlong(long long arg) { return arg; }
__int128 pass_int128(__int128 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0)
+__fp16 pass___fp16(__fp16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} half @pass___fp16(half %{{.*}})
+
_Float16 pass__Float16(_Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}})
@@ -75,6 +78,8 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; }
_Complex long long pass_complex_longlong(_Complex long long arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg)
+// _Complex __fp16 is (currently?) not allowed.
+
_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg)
@@ -129,6 +134,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; }
// Float-like aggregate types
+struct agg___fp16 { __fp16 a; };
+struct agg___fp16 pass_agg___fp16(struct agg___fp16 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, half %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, i16 noext %{{.*}})
+
struct agg__Float16 { _Float16 a; };
struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}})
@@ -148,6 +158,11 @@ struct agg_longdouble { long double a; };
struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}})
+struct agg___fp16_a8 { __fp16 a __attribute__((aligned (8))); };
+struct agg___fp16_a8 pass_agg___fp16_a8(struct agg___fp16_a8 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16_a8) align 8 %{{.*}}, double %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16_a8) align 8 %{{.*}}, i64 %{{.*}})
+
struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); };
struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}})
@@ -180,6 +195,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; }
// Union types likewise are *not* float-like aggregate types
+union union___fp16 { __fp16 a; };
+union union___fp16 pass_union___fp16(union union___fp16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_union___fp16(ptr dead_on_unwind noalias writable sret(%union.union___fp16) align 2 %{{.*}}, i16 noext %{{.*}})
+
union union__Float16 { _Float16 a; };
union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
@@ -461,6 +480,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l,
// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
// CHECK: ret void
+struct agg___fp16 va_agg___fp16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg___fp16); }
+// CHECK-LABEL: define{{.*}} void @va_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, ptr %{{.*}}
+// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
+// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0
+// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]]
+// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4
+// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5
+// CHECK: br i1 [[FITS_IN_REGS]],
+// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8
+// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128
+// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22
+// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3
+// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]]
+// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]]
+// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1
+// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]]
+// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2
+// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6
+// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8
+// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
+// CHECK: ret void
+
struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); }
// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}}
// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index ecef68a90957226..2f6d6cd954cc533 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -523,7 +523,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTruncStoreAction(VT, MVT::f16, Expand);
}
setOperationAction(ISD::LOAD, MVT::f16, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Custom);
setOperationAction(ISD::STORE, MVT::f16, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
@@ -4596,6 +4598,22 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT RegVT = Op.getSimpleValueType();
+ if (RegVT.getSizeInBits() == 128)
+ return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerLoadF16(Op, DAG);
+}
+
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+ if (Node->getMemoryVT().getSizeInBits() == 128)
+ return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerStoreF16(Op, DAG);
+}
+
SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
@@ -6217,15 +6235,25 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
MVT RegVT = Op.getSimpleValueType();
if (RegVT != MVT::f16)
return SDValue();
- LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
- SDLoc DL(Ld);
- assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending f16 load");
+
+ SDLoc DL(Op);
+ SDValue NewLd;
+ if (auto *AtomicLd = dyn_cast<AtomicSDNode>(Op.getNode())) {
+ assert(EVT(RegVT) == AtomicLd->getMemoryVT() && "Unhandled f16 load");
+ NewLd = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, MVT::i16, MVT::i32,
+ AtomicLd->getChain(), AtomicLd->getBasePtr(),
+ AtomicLd->getMemOperand());
+ cast<AtomicSDNode>(NewLd)->setExtensionType(ISD::EXTLOAD);
+ } else {
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ assert(EVT(RegVT) == Ld->getMemoryVT() && "Unhandled f16 load");
+ NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getPointerInfo(),
+ MVT::i16, Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ }
// Load as integer, shift and insert into upper 2 bytes of the FP register.
// TODO: Use VLEH if available.
- SDValue NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
- Ld->getBasePtr(), Ld->getPointerInfo(),
- MVT::i16, Ld->getOriginalAlign(),
- Ld->getMemOperand()->getFlags());
SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i32, NewLd,
DAG.getConstant(16, DL, MVT::i32));
SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Shft);
@@ -6236,20 +6264,25 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
SelectionDAG &DAG) const {
- StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
- SDLoc DL(St);
- SDValue StoredVal = St->getValue();
+ SDValue StoredVal = Op->getOperand(1);
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT != MVT::f16)
return SDValue();
- // Move into a GPR, shift and store the 2 bytes.
- // TODO: Use VSTEH if available.
+
+ // Move into a GPR, shift and store the 2 bytes. TODO: Use VSTEH if available.
+ SDLoc DL(Op);
SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f32);
SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL,
MVT::f32, SDValue(U32, 0), StoredVal);
SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, In32);
SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i32, BCast,
DAG.getConstant(16, DL, MVT::i32));
+
+ if (auto *AtomicSt = dyn_cast<AtomicSDNode>(Op.getNode()))
+ return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MVT::i16, AtomicSt->getChain(),
+ Shft, AtomicSt->getBasePtr(), AtomicSt->getMemOperand());
+
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(),
MVT::i16, St->getMemOperand());
}
@@ -6373,8 +6406,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
+ return lowerATOMIC_STORE(Op, DAG);
case ISD::ATOMIC_LOAD:
- return lowerATOMIC_LDST_I128(Op, DAG);
+ return lowerATOMIC_LOAD(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 36c5a78dd8027f5..d3f700561c29ac5 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -698,6 +698,8 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
unsigned Opcode) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index f37826f8c1f5c94..cf7f5d9dd61a95d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -995,8 +995,31 @@ void SystemZInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIdx, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI, Register VReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ // There are no fp16 load/store instructions, so need to save/restore via
+ // GPR (TODO: Use VSTEH in case of vector support).
+ if (RC == &SystemZ::FP16BitRegClass) {
+ assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
+ "Expected non-SSA form with virtual registers.");
+ Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::COPY))
+ .addReg(FP64Reg, RegState::DefineNoRead, SystemZ::subreg_h16)
+ .addReg(SrcReg, getKillRegState(isKill));
+ BuildMI(MBB, MBBI, DL, get(SystemZ::LGDR), GR64Reg)
+ .addReg(FP64Reg, RegState::Kill);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::SRLG), GR64Reg)
+ .addReg(GR64Reg)
+ .addReg(0)
+ .addImm(48);
+ addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::STH))
+ .addReg(GR64Reg, RegState::Kill, SystemZ::subreg_l32),
+ FrameIdx);
+ return;
+ }
+
// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
@@ -1012,8 +1035,31 @@ void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
Register VReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ // There are no fp16 load/store instructions, so need to save/restore via
+ // GPR (TODO: Use VLEH in case of vector support).
+ if (RC == &SystemZ::FP16BitRegClass) {
+ assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
+ "Expected non-SSA form with virtual registers.");
+ Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
+ addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::LH))
+ .addReg(GR64Reg, RegState::DefineNoRead,
+ SystemZ::subreg_l32),
+ FrameIdx);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::SLLG), GR64Reg)
+ .addReg(GR64Reg)
+ .addReg(0)
+ .addImm(48);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), FP64Reg)
+ .addReg(GR64Reg, RegState::Kill);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::COPY), DestReg)
+ .addReg(FP64Reg, RegState::Kill, SystemZ::subreg_h16);
+ return;
+ }
+
// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-10.ll b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
new file mode 100644
index 000000000000000..077cae9c5432819
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test fp16 atomic loads.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define half @f1(ptr %src) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lh %r0, 0(%r2)
+; CHECK-NEXT: sllg %r0, %r0, 48
+; CHECK-NEXT: ldgr %f0, %r0
+; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; CHECK-NEXT: br %r14
+ %val = load atomic half, ptr %src seq_cst, align 2
+ ret half %val
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-10.ll b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
new file mode 100644
index 000000000000000..3fa5ce0a88bae5d
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test half atomic stores.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define void @f1(ptr %src, half %val) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 48
+; CHECK-NEXT: sth %r0, 0(%r2)
+; CHECK-NEXT: bcr 15, %r0
+; CHECK-NEXT: br %r14
+ store atomic half %val, ptr %src seq_cst, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
new file mode 100644
index 000000000000000..ed405ce82c1cd26
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
@@ -0,0 +1,1050 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=NOVEC
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefix=VECTOR
+
+; Add the <8 x half> argument with itself and return it.
+define <8 x half> @fun0(<8 x half> %Op) {
+; NOVEC-LABEL: fun0:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -224
+; NOVEC-NEXT: .cfi_def_cfa_offset 384
+; NOVEC-NEXT: std %f8, 216(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 208(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 200(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 192(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f12, 184(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f13, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f14, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f15, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: .cfi_offset %f12, -200
+; NOVEC-NEXT: .cfi_offset %f13, -208
+; NOVEC-NEXT: .cfi_offset %f14, -216
+; NOVEC-NEXT: .cfi_offset %f15, -224
+; NOVEC-NEXT: lh %r0, 414(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f15, %r0
+; NOVEC-NEXT: lh %r0, 406(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f12, %r0
+; NOVEC-NEXT: lh %r0, 398(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f9, %r0
+; NOVEC-NEXT: lh %r0, 390(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ler %f10, %f6
+; NOVEC-NEXT: ler %f11, %f4
+; NOVEC-NEXT: ler %f13, %f2
+; NOVEC-NEXT: ler %f14, %f0
+; NOVEC-NEXT: lgr %r13, %r2
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f12
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f12, %f0
+; NOVEC-NEXT: ler %f0, %f15
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f15, %f0
+; NOVEC-NEXT: ler %f0, %f14
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f14, %f0
+; NOVEC-NEXT: ler %f0, %f13
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f13, %f0
+; NOVEC-NEXT: ler %f0, %f11
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f11, %f0
+; NOVEC-NEXT: ler %f0, %f10
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r13)
+; NOVEC-NEXT: lgdr %r0, %f11
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r13)
+; NOVEC-NEXT: lgdr %r0, %f13
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r13)
+; NOVEC-NEXT: lgdr %r0, %f14
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lgdr %r0, %f15
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r13)
+; NOVEC-NEXT: lgdr %r0, %f12
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r13)
+; NOVEC-NEXT: lgdr %r0, %f9
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r13)
+; NOVEC-NEXT: lgdr %r0, %f8
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r13)
+; NOVEC-NEXT: ld %f8, 216(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 208(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 200(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 192(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f12, 184(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f13, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f14, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f15, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r13, %r15, 328(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun0:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -224
+; VECTOR-NEXT: .cfi_def_cfa_offset 384
+; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: .cfi_offset %f12, -200
+; VECTOR-NEXT: .cfi_offset %f13, -208
+; VECTOR-NEXT: .cfi_offset %f14, -216
+; VECTOR-NEXT: .cfi_offset %f15, -224
+; VECTOR-NEXT: lh %r0, 414(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v14, %r0, 0
+; VECTOR-NEXT: lh %r0, 406(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v12, %r0, 0
+; VECTOR-NEXT: lh %r0, 398(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v9, %r0, 0
+; VECTOR-NEXT: lh %r0, 390(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: ldr %f10, %f6
+; VECTOR-NEXT: ldr %f11, %f4
+; VECTOR-NEXT: ldr %f13, %f2
+; VECTOR-NEXT: lgr %r13, %r2
+; VECTOR-NEXT: ldr %f15, %f0
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f12
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f12, %f0
+; VECTOR-NEXT: ldr %f0, %f14
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f14, %f0
+; VECTOR-NEXT: ldr %f0, %f15
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f15, %f0
+; VECTOR-NEXT: ldr %f0, %f13
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f13, %f0
+; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f11, %f0
+; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: # kill: def $f0h killed $f0h def $f0s
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 6(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v11, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 4(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v13, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 2(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v15, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v14, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 14(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v12, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 12(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v9, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 10(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v8, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 8(%r13)
+; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r13, %r15, 328(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Res = fadd <8 x half> %Op, %Op
+ ret <8 x half> %Res
+}
+
+; Same, but with partial vector values.
+define <4 x half> @fun1(<4 x half> %Op) {
+; NOVEC-LABEL: fun1:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -192
+; NOVEC-NEXT: .cfi_def_cfa_offset 352
+; NOVEC-NEXT: std %f8, 184(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: ler %f8, %f6
+; NOVEC-NEXT: ler %f9, %f4
+; NOVEC-NEXT: ler %f10, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f11, %f0
+; NOVEC-NEXT: ler %f0, %f10
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: aebr %f0, %f0
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f6, %f0
+; NOVEC-NEXT: ler %f0, %f11
+; NOVEC-NEXT: ler %f2, %f10
+; NOVEC-NEXT: ler %f4, %f9
+; NOVEC-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 304(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun1:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -192
+; VECTOR-NEXT: .cfi_def_cfa_offset 352
+; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: ldr %f8, %f6
+; VECTOR-NEXT: ldr %f9, %f4
+; VECTOR-NEXT: ldr %f10, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f11, %f0
+; VECTOR-NEXT: ldr %f0, %f10
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f10, %f0
+; VECTOR-NEXT: ldr %f0, %f9
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f9, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: aebr %f0, %f0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f6, %f0
+; VECTOR-NEXT: ldr %f0, %f11
+; VECTOR-NEXT: ldr %f2, %f10
+; VECTOR-NEXT: ldr %f4, %f9
+; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 304(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %Res = fadd <4 x half> %Op, %Op
+ ret <4 x half> %Res
+}
+
+; Test a vector extension.
+define <2 x half> @fun2(<2 x half> %Op) {
+; NOVEC-LABEL: fun2:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -184
+; NOVEC-NEXT: .cfi_def_cfa_offset 344
+; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: ler %f8, %f2
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: ldebr %f9, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
+; NOVEC-NEXT: adbr %f9, %f9
+; NOVEC-NEXT: ldebr %f10, %f0
+; NOVEC-NEXT: ledbr %f0, %f9
+; NOVEC-NEXT: adbr %f10, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f8, %f0
+; NOVEC-NEXT: ledbr %f0, %f10
+; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
+; NOVEC-NEXT: ler %f2, %f0
+; NOVEC-NEXT: ler %f0, %f8
+; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 296(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun2:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -184
+; VECTOR-NEXT: .cfi_def_cfa_offset 344
+; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: ldr %f0, %f2
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: brasl %r14, __extendhfsf2 at PLT
+; VECTOR-NEXT: ldebr %f0, %f0
+; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: vmrhg %v0, %v0, %v1
+; VECTOR-NEXT: vfadb %v0, %v0, %v0
+; VECTOR-NEXT: vledb %v0, %v0, 0, 0
+; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; VECTOR-NEXT: # kill: def $f0s killed $f0s killed $v0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f8, %f0
+; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; VECTOR-NEXT: vrepf %v0, %v0, 2
+; VECTOR-NEXT: # kill: def $f0s killed $f0s killed $v0
+; VECTOR-NEXT: brasl %r14, __truncsfhf2 at PLT
+; VECTOR-NEXT: ldr %f2, %f0
+; VECTOR-NEXT: ldr %f0, %f8
+; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 296(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %E = fpext <2 x half> %Op to <2 x double>
+ %Add = fadd <2 x double> %E, %E
+ %Res = fptrunc <2 x double> %Add to <2 x half>
+ ret <2 x half> %Res
+}
+
+; Load and store an <8 x half> vector.
+define void @fun3(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun3:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 2(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 4(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: lh %r0, 6(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 8(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: lh %r0, 10(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 12(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: lh %r0, 14(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r3)
+; NOVEC-NEXT: lgdr %r0, %f6
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r3)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r3)
+; NOVEC-NEXT: lgdr %r0, %f4
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r3)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r3)
+; NOVEC-NEXT: lgdr %r0, %f2
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r3)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r3)
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r3)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun3:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: lh %r0, 2(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v1, %r0, 0
+; VECTOR-NEXT: lh %r0, 4(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v2, %r0, 0
+; VECTOR-NEXT: lh %r0, 6(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v3, %r0, 0
+; VECTOR-NEXT: lh %r0, 8(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v4, %r0, 0
+; VECTOR-NEXT: lh %r0, 10(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v5, %r0, 0
+; VECTOR-NEXT: lh %r0, 12(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v6, %r0, 0
+; VECTOR-NEXT: lh %r0, 14(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v7, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v7, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 14(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v6, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 12(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v5, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 10(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v4, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 8(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v3, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 6(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v2, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 4(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v1, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 2(%r3)
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r3)
+; VECTOR-NEXT: br %r14
+entry:
+ %L = load <8 x half>, ptr %Src
+ store <8 x half> %L, ptr %Dst
+ ret void
+}
+
+; Call a function with <8 x half> argument and return values.
+declare <8 x half> @foo(<8 x half>)
+define void @fun4(ptr %Src, ptr %Dst) {
+; NOVEC-LABEL: fun4:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -208
+; NOVEC-NEXT: .cfi_def_cfa_offset 368
+; NOVEC-NEXT: lh %r0, 0(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 2(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d
+; NOVEC-NEXT: lh %r0, 4(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
+; NOVEC-NEXT: lh %r0, 6(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d
+; NOVEC-NEXT: lh %r0, 8(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 10(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 12(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 14(%r2)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 190(%r15)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 182(%r15)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: la %r2, 192(%r15)
+; NOVEC-NEXT: lgr %r13, %r3
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, foo at PLT
+; NOVEC-NEXT: lh %r0, 192(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lh %r0, 194(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 196(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f2, %r0
+; NOVEC-NEXT: lh %r0, 198(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 200(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: lh %r0, 202(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 204(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: lh %r0, 206(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 14(%r13)
+; NOVEC-NEXT: lgdr %r0, %f6
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 12(%r13)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 10(%r13)
+; NOVEC-NEXT: lgdr %r0, %f4
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 8(%r13)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 6(%r13)
+; NOVEC-NEXT: lgdr %r0, %f2
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 4(%r13)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 2(%r13)
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 0(%r13)
+; NOVEC-NEXT: lmg %r13, %r15, 312(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun4:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r13, %r15, 104(%r15)
+; VECTOR-NEXT: .cfi_offset %r13, -56
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -208
+; VECTOR-NEXT: .cfi_def_cfa_offset 368
+; VECTOR-NEXT: lh %r0, 0(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: lh %r0, 2(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v2, %r0, 0
+; VECTOR-NEXT: lh %r0, 4(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v4, %r0, 0
+; VECTOR-NEXT: lh %r0, 6(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v6, %r0, 0
+; VECTOR-NEXT: lh %r0, 8(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v1, %r0, 0
+; VECTOR-NEXT: lh %r0, 10(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v3, %r0, 0
+; VECTOR-NEXT: lh %r0, 12(%r2)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v5, %r0, 0
+; VECTOR-NEXT: lh %r0, 14(%r2)
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $f0s
+; VECTOR-NEXT: # kill: def $f2h killed $f2h killed $f2s
+; VECTOR-NEXT: # kill: def $f4h killed $f4h killed $f4s
+; VECTOR-NEXT: # kill: def $f6h killed $f6h killed $f6s
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v7, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v7, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 190(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v5, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 182(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v3, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 174(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v1, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 166(%r15)
+; VECTOR-NEXT: la %r2, 192(%r15)
+; VECTOR-NEXT: lgr %r13, %r3
+; VECTOR-NEXT: brasl %r14, foo at PLT
+; VECTOR-NEXT: lh %r0, 192(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: lh %r0, 194(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v1, %r0, 0
+; VECTOR-NEXT: lh %r0, 196(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v2, %r0, 0
+; VECTOR-NEXT: lh %r0, 198(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v3, %r0, 0
+; VECTOR-NEXT: lh %r0, 200(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v4, %r0, 0
+; VECTOR-NEXT: lh %r0, 202(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v5, %r0, 0
+; VECTOR-NEXT: lh %r0, 204(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v6, %r0, 0
+; VECTOR-NEXT: lh %r0, 206(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v7, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v7, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 14(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v6, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 12(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v5, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 10(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v4, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 8(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v3, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 6(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v2, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 4(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v1, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 2(%r13)
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 0(%r13)
+; VECTOR-NEXT: lmg %r13, %r15, 312(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ %arg = load <8 x half>, ptr %Src
+ %Res = call <8 x half> @foo(<8 x half> %arg)
+ store <8 x half> %Res, ptr %Dst
+ ret void
+}
+
+; Receive and pass argument fully on stack.
+declare void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
+define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) {
+; NOVEC-LABEL: fun5:
+; NOVEC: # %bb.0:
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -256
+; NOVEC-NEXT: .cfi_def_cfa_offset 416
+; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Folded Spill
+; NOVEC-NEXT: .cfi_offset %f8, -168
+; NOVEC-NEXT: .cfi_offset %f9, -176
+; NOVEC-NEXT: .cfi_offset %f10, -184
+; NOVEC-NEXT: .cfi_offset %f11, -192
+; NOVEC-NEXT: lh %r0, 422(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lh %r0, 430(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f3, %r0
+; NOVEC-NEXT: lh %r0, 438(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: lh %r0, 446(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f7, %r0
+; NOVEC-NEXT: lh %r0, 454(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f8, %r0
+; NOVEC-NEXT: lh %r0, 462(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f9, %r0
+; NOVEC-NEXT: lh %r0, 470(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f10, %r0
+; NOVEC-NEXT: lh %r0, 478(%r15)
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f11, %r0
+; NOVEC-NEXT: lgdr %r0, %f11
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 222(%r15)
+; NOVEC-NEXT: lgdr %r0, %f10
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 214(%r15)
+; NOVEC-NEXT: lgdr %r0, %f9
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 206(%r15)
+; NOVEC-NEXT: lgdr %r0, %f8
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 198(%r15)
+; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 190(%r15)
+; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 182(%r15)
+; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: lgdr %r0, %f1
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, foo2 at PLT
+; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Folded Reload
+; NOVEC-NEXT: lmg %r14, %r15, 368(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun5:
+; VECTOR: # %bb.0:
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -256
+; VECTOR-NEXT: .cfi_def_cfa_offset 416
+; VECTOR-NEXT: std %f8, 248(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f9, 240(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f10, 232(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: std %f11, 224(%r15) # 8-byte Folded Spill
+; VECTOR-NEXT: .cfi_offset %f8, -168
+; VECTOR-NEXT: .cfi_offset %f9, -176
+; VECTOR-NEXT: .cfi_offset %f10, -184
+; VECTOR-NEXT: .cfi_offset %f11, -192
+; VECTOR-NEXT: lh %r0, 422(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v1, %r0, 0
+; VECTOR-NEXT: lh %r0, 430(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v3, %r0, 0
+; VECTOR-NEXT: lh %r0, 438(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v5, %r0, 0
+; VECTOR-NEXT: lh %r0, 446(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v7, %r0, 0
+; VECTOR-NEXT: lh %r0, 454(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v8, %r0, 0
+; VECTOR-NEXT: lh %r0, 462(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v9, %r0, 0
+; VECTOR-NEXT: lh %r0, 470(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v10, %r0, 0
+; VECTOR-NEXT: lh %r0, 478(%r15)
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v11, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v11, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 222(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v10, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 214(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v9, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 206(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v8, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 198(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v7, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 190(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v5, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 182(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v3, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 174(%r15)
+; VECTOR-NEXT: vlgvf %r0, %v1, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 166(%r15)
+; VECTOR-NEXT: brasl %r14, foo2 at PLT
+; VECTOR-NEXT: ld %f8, 248(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f9, 240(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f10, 232(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: ld %f11, 224(%r15) # 8-byte Folded Reload
+; VECTOR-NEXT: lmg %r14, %r15, 368(%r15)
+; VECTOR-NEXT: br %r14
+ call void @foo2(<4 x half> %dummy, <8 x half> %Arg5)
+ ret void
+}
+
+; Test loading vector constants.
+declare void @foo3(<8 x half>, <8 x half>)
+define void @fun6() {
+; NOVEC-LABEL: fun6:
+; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
+; NOVEC-NEXT: aghi %r15, -256
+; NOVEC-NEXT: .cfi_def_cfa_offset 416
+; NOVEC-NEXT: lhrl %r0, .LCPI6_0
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: sth %r0, 254(%r15)
+; NOVEC-NEXT: sth %r0, 246(%r15)
+; NOVEC-NEXT: sth %r0, 238(%r15)
+; NOVEC-NEXT: sth %r0, 230(%r15)
+; NOVEC-NEXT: sth %r0, 222(%r15)
+; NOVEC-NEXT: sth %r0, 214(%r15)
+; NOVEC-NEXT: sth %r0, 206(%r15)
+; NOVEC-NEXT: sth %r0, 198(%r15)
+; NOVEC-NEXT: lhrl %r0, .LCPI6_1
+; NOVEC-NEXT: sll %r0, 16
+; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32
+; NOVEC-NEXT: srl %r0, 16
+; NOVEC-NEXT: ler %f2, %f0
+; NOVEC-NEXT: ler %f4, %f0
+; NOVEC-NEXT: ler %f6, %f0
+; NOVEC-NEXT: sth %r0, 190(%r15)
+; NOVEC-NEXT: sth %r0, 182(%r15)
+; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: sth %r0, 166(%r15)
+; NOVEC-NEXT: brasl %r14, foo3 at PLT
+; NOVEC-NEXT: lmg %r14, %r15, 368(%r15)
+; NOVEC-NEXT: br %r14
+;
+; VECTOR-LABEL: fun6:
+; VECTOR: # %bb.0: # %entry
+; VECTOR-NEXT: stmg %r14, %r15, 112(%r15)
+; VECTOR-NEXT: .cfi_offset %r14, -48
+; VECTOR-NEXT: .cfi_offset %r15, -40
+; VECTOR-NEXT: aghi %r15, -256
+; VECTOR-NEXT: .cfi_def_cfa_offset 416
+; VECTOR-NEXT: lhrl %r0, .LCPI6_0
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: sth %r0, 254(%r15)
+; VECTOR-NEXT: sth %r0, 246(%r15)
+; VECTOR-NEXT: sth %r0, 238(%r15)
+; VECTOR-NEXT: sth %r0, 230(%r15)
+; VECTOR-NEXT: sth %r0, 222(%r15)
+; VECTOR-NEXT: sth %r0, 214(%r15)
+; VECTOR-NEXT: sth %r0, 206(%r15)
+; VECTOR-NEXT: sth %r0, 198(%r15)
+; VECTOR-NEXT: lhrl %r0, .LCPI6_1
+; VECTOR-NEXT: sll %r0, 16
+; VECTOR-NEXT: vlvgf %v0, %r0, 0
+; VECTOR-NEXT: vlgvf %r0, %v0, 0
+; VECTOR-NEXT: srl %r0, 16
+; VECTOR-NEXT: ldr %f2, %f0
+; VECTOR-NEXT: ldr %f4, %f0
+; VECTOR-NEXT: ldr %f6, %f0
+; VECTOR-NEXT: sth %r0, 190(%r15)
+; VECTOR-NEXT: sth %r0, 182(%r15)
+; VECTOR-NEXT: sth %r0, 174(%r15)
+; VECTOR-NEXT: sth %r0, 166(%r15)
+; VECTOR-NEXT: brasl %r14, foo3 at PLT
+; VECTOR-NEXT: lmg %r14, %r15, 368(%r15)
+; VECTOR-NEXT: br %r14
+entry:
+ call void @foo3(<8 x half> <half 0.0, half 0.0, half 0.0, half 0.0,
+ half 0.0, half 0.0, half 0.0, half 0.0>,
+ <8 x half> <half 0.375, half 0.375, half 0.375, half 0.375,
+ half 0.375, half 0.375, half 0.375, half 0.375>)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
index cbc680f90d160f9..c704efe2c26eb0f 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -6,7 +6,6 @@
;
; Tests for 16-bit floating point (half).
-
; Incoming half arguments added together and returned.
define half @fun0(half %Op0, half %Op1) {
; NOVEC-LABEL: fun0:
@@ -680,6 +679,6 @@ define void @fun11() {
; VECTOR-NEXT: lmg %r14, %r15, 272(%r15)
; VECTOR-NEXT: br %r14
entry:
- call half @foo2(half 0.0, half 1.0, half 0.375)
+ call void @foo2(half 0.0, half 1.0, half 0.375)
ret void
}
diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
index 36b3ef67c9e75a1..95114eab364e579 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT
; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT
-; Test f16 libcalls.
+; Test f16 libcall.
define half @f0(half %x) {
; CHECK-OPT-LABEL: f0:
; CHECK-OPT-NOT: brasl %r14, __extendhfsf2 at PLT
diff --git a/llvm/test/CodeGen/SystemZ/spill-half.mir b/llvm/test/CodeGen/SystemZ/spill-half.mir
new file mode 100644
index 000000000000000..07934103da6edc9
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/spill-half.mir
@@ -0,0 +1,39 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z10 -verify-machineinstrs \
+# RUN: -start-before=greedy | FileCheck %s
+
+# Test spilling / reloading of an fp16bit virtual register.
+
+---
+name: fun0
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fp16bit }
+liveins:
+ - { reg: '$f0h', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $f0h
+
+ ; CHECK-LABEL: fun0:
+ ; CHECK-NOT: $f0
+ ; CHECK: # kill: def $f0h killed $f0h killed $f0d def $f0d
+ ; CHECK-NEXT: lgdr %r0, %f0
+ ; CHECK-NEXT: srlg %r0, %r0, 48
+ ; CHECK-NEXT: sth %r0, 166(%r15) # 2-byte Folded Spill
+ ; CHECK-NEXT: #APP
+ ; CHECK-NEXT: #NO_APP
+ ; CHECK-NEXT: lh %r0, 166(%r15) # 2-byte Folded Reload
+ ; CHECK-NEXT: sllg %r0, %r0, 48
+ ; CHECK-NEXT: ldgr %f0, %r0
+ ; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
+ ; CHECK-NOT: $f0
+
+ %0:fp16bit = COPY $f0h
+ INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0d, 12, implicit-def dead early-clobber $r1d, 12, implicit-def dead early-clobber $r2d, 12, implicit-def dead early-clobber $r3d, 12, implicit-def dead early-clobber $r4d, 12, implicit-def dead early-clobber $r5d, 12, implicit-def dead early-clobber $r6d, 12, implicit-def dead early-clobber $r7d, 12, implicit-def dead early-clobber $r8d, 12, implicit-def dead early-clobber $r9d, 12, implicit-def dead early-clobber $r10d, 12, implicit-def dead early-clobber $r11d, 12, implicit-def dead early-clobber $r12d, 12, implicit-def dead early-clobber $r13d, 12, implicit-def dead early-clobber $r14d, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d
+ $f0h = COPY %0
+ Return implicit $f0h
+...
More information about the llvm-commits
mailing list