[llvm] 9e759f3 - [AArch64] Fix fptoi/itofp for bf16
David Majnemer via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 5 22:19:45 PST 2024
Author: David Majnemer
Date: 2024-03-06T06:17:39Z
New Revision: 9e759f3523e9e7fa955e4af58a6b4f63634dcb24
URL: https://github.com/llvm/llvm-project/commit/9e759f3523e9e7fa955e4af58a6b4f63634dcb24
DIFF: https://github.com/llvm/llvm-project/commit/9e759f3523e9e7fa955e4af58a6b4f63634dcb24.diff
LOG: [AArch64] Fix fptoi/itofp for bf16
There were a number of issues that needed to be addressed:
- i64 to bf16 did not correctly round
- strict rounding needed to yield a chain
- fastisel did not have logic to bail on bf16
Added:
Modified:
llvm/lib/Target/AArch64/AArch64FastISel.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
llvm/test/CodeGen/AArch64/itofp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 87c94f10e2b120..62cf6a2c47accf 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -2828,7 +2828,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
return false;
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
- if (SrcVT == MVT::f128 || SrcVT == MVT::f16)
+ if (SrcVT == MVT::f128 || SrcVT == MVT::f16 || SrcVT == MVT::bf16)
return false;
unsigned Opc;
@@ -2856,7 +2856,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
// Let regular ISEL handle FP16
- if (DestVT == MVT::f16)
+ if (DestVT == MVT::f16 || DestVT == MVT::bf16)
return false;
assert((DestVT == MVT::f32 || DestVT == MVT::f64) &&
@@ -2978,7 +2978,7 @@ bool AArch64FastISel::fastLowerArguments() {
} else if (VT == MVT::i64) {
SrcReg = Registers[1][GPRIdx++];
RC = &AArch64::GPR64RegClass;
- } else if (VT == MVT::f16) {
+ } else if (VT == MVT::f16 || VT == MVT::bf16) {
SrcReg = Registers[2][FPRIdx++];
RC = &AArch64::FPR16RegClass;
} else if (VT == MVT::f32) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6061b4c1a07610..2290223a06f8ef 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4121,14 +4121,16 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
// Now that we have rounded, shift the bits into position.
Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
- DAG.getShiftAmountConstant(16, I32, dl));
+ DAG.getShiftAmountConstant(16, I32, dl));
if (VT.isVector()) {
EVT I16 = I32.changeVectorElementType(MVT::i16);
Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
}
Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
- return DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
+ SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
+ return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
+ : Result;
}
if (SrcVT != MVT::f128) {
@@ -4487,20 +4489,121 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
bool IsStrict = Op->isStrictFPOpcode();
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
- // f16 conversions are promoted to f32 when full fp16 is not supported.
- if ((Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) || Op.getValueType() == MVT::bf16) {
+ bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Op->getOpcode() == ISD::SINT_TO_FP;
+
+ auto IntToFpViaPromotion = [&](EVT PromoteVT) {
SDLoc dl(Op);
if (IsStrict) {
- SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
+ SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
{Op.getOperand(0), SrcVal});
return DAG.getNode(
ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
{Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
}
- return DAG.getNode(
- ISD::FP_ROUND, dl, Op.getValueType(),
- DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
- DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
+ DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
+ DAG.getIntPtrConstant(0, dl));
+ };
+
+ if (Op.getValueType() == MVT::bf16) {
+ // bf16 conversions are promoted to f32 when the source has at most 24
+ // significant bits (e.g. i16), which f32 can represent exactly.
+ if (DAG.ComputeMaxSignificantBits(SrcVal) <= 24) {
+ return IntToFpViaPromotion(MVT::f32);
+ }
+
+ // bf16 conversions are promoted to f64 when the source has at most 53
+ // significant bits (e.g. i32), which f64 can represent exactly.
+ if (DAG.ComputeMaxSignificantBits(SrcVal) <= 53) {
+ return IntToFpViaPromotion(MVT::f64);
+ }
+
+ // We need to be careful about i64 -> bf16.
+ // Consider the i32 value 22216703.
+ // This number cannot be represented exactly as an f32, so an itofp will
+ // turn it into 22216704.0; an fptrunc to bf16 will then turn this into
+ // 22282240.0. However, the correct bf16 result is 22151168.0.
+ // We need to use sticky rounding to get this correct.
+ if (SrcVal.getValueType() == MVT::i64) {
+ SDLoc DL(Op);
+ // This algorithm is equivalent to the following:
+ // uint64_t SrcHi = SrcVal & ~0xfffull;
+ // uint64_t SrcLo = SrcVal & 0xfffull;
+ // uint64_t Highest = SrcVal >> 53;
+ // bool HasHighest = Highest != 0;
+ // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
+ // double Rounded = static_cast<double>(ToRound);
+ // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
+ // uint64_t HasLo = SrcLo != 0;
+ // bool NeedsAdjustment = HasHighest & HasLo;
+ // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
+ // double Adjusted = std::bit_cast<double>(AdjustedBits);
+ // return static_cast<__bf16>(Adjusted);
+ //
+ // Essentially, what happens is that SrcVal either fits perfectly in a
+ // double-precision value or it is too big. If it is sufficiently small,
+ // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
+ // ensure that u64 -> double has no rounding error by only using the 52
+ // MSB of the input. The low order bits will get merged into a sticky bit
+ // which will avoid issues incurred by double rounding.
+
+ // Signed conversion is more or less like so:
+ // copysign((__bf16)abs(SrcVal), SrcVal)
+ SDValue SignBit;
+ if (IsSigned) {
+ SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
+ DAG.getConstant(1ull << 63, DL, MVT::i64));
+ SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
+ }
+ SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
+ DAG.getConstant(~0xfffull, DL, MVT::i64));
+ SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
+ DAG.getConstant(0xfffull, DL, MVT::i64));
+ SDValue Highest =
+ DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
+ DAG.getShiftAmountConstant(53, MVT::i64, DL));
+ SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
+ SDValue ToRound =
+ DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
+ SDValue Rounded =
+ IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
+ {Op.getOperand(0), ToRound})
+ : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
+
+ SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
+ if (SignBit) {
+ RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
+ }
+
+ SDValue HasHighest = DAG.getSetCC(
+ DL,
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ Highest, Zero64, ISD::SETNE);
+
+ SDValue HasLo = DAG.getSetCC(
+ DL,
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ SrcLo, Zero64, ISD::SETNE);
+
+ SDValue NeedsAdjustment =
+ DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
+ NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
+
+ SDValue AdjustedBits =
+ DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
+ SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
+ return IsStrict
+ ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
+ {Op.getValueType(), MVT::Other},
+ {Rounded.getValue(1), Adjusted,
+ DAG.getIntPtrConstant(0, DL)})
+ : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
+ DAG.getIntPtrConstant(0, DL, true));
+ }
+ }
+
+ // f16 conversions are promoted to f32 when full fp16 is not supported.
+ if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
+ return IntToFpViaPromotion(MVT::f32);
}
// i128 conversions are libcalls.
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 9bf638f57a5120..49325299f74a12 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -54,6 +54,30 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
ret <4 x half> %tmp2
}
+define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
+; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v1.4s, #127, msl #8
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: ret
+ %tmp1 = load <4 x i64>, ptr %ptr
+ %tmp2 = uitofp <4 x i64> %tmp1 to <4 x bfloat>
+ ret <4 x bfloat> %tmp2
+}
+
define <4 x i16> @trunc_v4i64_to_v4i16(ptr %ptr) {
; CHECK-LABEL: trunc_v4i64_to_v4i16:
; CHECK: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
index b3c073f5354204..1aa28f5c2733db 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
@@ -1,129 +1,416 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=arm64-eabi < %s | FileCheck --enable-var-scope %s
; Test fptosi
define i32 @fptosi_wh(half %a) nounwind ssp {
+; CHECK-LABEL: fptosi_wh:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzs w0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: fptosi_wh
-; CHECK: fcvt [[REG:s[0-9]+]], h0
-; CHECK: fcvtzs w0, [[REG]]
%conv = fptosi half %a to i32
ret i32 %conv
}
; Test fptoui
define i32 @fptoui_swh(half %a) nounwind ssp {
+; CHECK-LABEL: fptoui_swh:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzu w0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: fptoui_swh
-; CHECK: fcvt [[REG:s[0-9]+]], h0
-; CHECK: fcvtzu w0, [[REG]]
%conv = fptoui half %a to i32
ret i32 %conv
}
; Test sitofp
define half @sitofp_hw_i1(i1 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_hw_i1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sbfx w8, w0, #0, #1
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sitofp_hw_i1
-; CHECK: sbfx [[REG:w[0-9]+]], w0, #0, #1
-; CHECK: scvtf s0, [[REG]]
-; CHECK: fcvt h0, s0
%conv = sitofp i1 %a to half
ret half %conv
}
; Test sitofp
define half @sitofp_hw_i8(i8 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_hw_i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sxtb w8, w0
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sitofp_hw_i8
-; CHECK: sxtb [[REG:w[0-9]+]], w0
-; CHECK: scvtf s0, [[REG]]
-; CHECK: fcvt h0, s0
%conv = sitofp i8 %a to half
ret half %conv
}
; Test sitofp
define half @sitofp_hw_i16(i16 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_hw_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sxth w8, w0
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sitofp_hw_i16
-; CHECK: sxth [[REG:w[0-9]+]], w0
-; CHECK: scvtf s0, [[REG]]
-; CHECK: fcvt h0, s0
%conv = sitofp i16 %a to half
ret half %conv
}
; Test sitofp
define half @sitofp_hw_i32(i32 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_hw_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf s0, w0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sitofp_hw_i32
-; CHECK: scvtf s0, w0
-; CHECK: fcvt h0, s0
%conv = sitofp i32 %a to half
ret half %conv
}
; Test sitofp
define half @sitofp_hx(i64 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_hx:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf s0, x0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sitofp_hx
-; CHECK: scvtf s0, x0
-; CHECK: fcvt h0, s0
%conv = sitofp i64 %a to half
ret half %conv
}
; Test uitofp
define half @uitofp_hw_i1(i1 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_hw_i1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0x1
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: uitofp_hw_i1
-; CHECK: and [[REG:w[0-9]+]], w0, #0x1
-; CHECK: ucvtf s0, [[REG]]
-; CHECK: fcvt h0, s0
%conv = uitofp i1 %a to half
ret half %conv
}
; Test uitofp
define half @uitofp_hw_i8(i8 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_hw_i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: uitofp_hw_i8
-; CHECK: and [[REG:w[0-9]+]], w0, #0xff
-; CHECK: ucvtf s0, [[REG]]
-; CHECK: fcvt h0, s0
%conv = uitofp i8 %a to half
ret half %conv
}
; Test uitofp
define half @uitofp_hw_i16(i16 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_hw_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0xffff
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: uitofp_hw_i16
-; CHECK: and [[REG:w[0-9]+]], w0, #0xffff
-; CHECK: ucvtf s0, [[REG]]
-; CHECK: fcvt h0, s0
%conv = uitofp i16 %a to half
ret half %conv
}
; Test uitofp
define half @uitofp_hw_i32(i32 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_hw_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf s0, w0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: uitofp_hw_i32
-; CHECK: ucvtf s0, w0
-; CHECK: fcvt h0, s0
%conv = uitofp i32 %a to half
ret half %conv
}
; Test uitofp
define half @uitofp_hx(i64 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_hx:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf s0, x0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: uitofp_hx
-; CHECK: ucvtf s0, x0
-; CHECK: fcvt h0, s0
%conv = uitofp i64 %a to half
ret half %conv
}
+; Test fptosi
+define i32 @fptosi_bf(bfloat %a) nounwind ssp {
+; CHECK-LABEL: fptosi_bf:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, s0
+; CHECK-NEXT: // implicit-def: $s0
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzs w0, s0
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi bfloat %a to i32
+ ret i32 %conv
+}
+
+; Test fptoui
+define i32 @fptoui_sbf(bfloat %a) nounwind ssp {
+; CHECK-LABEL: fptoui_sbf:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, s0
+; CHECK-NEXT: // implicit-def: $s0
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzu w0, s0
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui bfloat %a to i32
+ ret i32 %conv
+}
+
+; Test sitofp
+define bfloat @sitofp_bf_i1(i1 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_bf_i1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sbfx w8, w0, #0, #1
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = sitofp i1 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test sitofp
+define bfloat @sitofp_bf_i8(i8 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_bf_i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sxtb w8, w0
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = sitofp i8 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test sitofp
+define bfloat @sitofp_bf_i16(i16 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_bf_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sxth w8, w0
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = sitofp i16 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test sitofp
+define bfloat @sitofp_bf_i32(i32 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_bf_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf d0, w0
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = sitofp i32 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test sitofp
+define bfloat @sitofp_bf_i164(i64 %a) nounwind ssp {
+; CHECK-LABEL: sitofp_bf_i164:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: subs x8, x0, #0
+; CHECK-NEXT: cneg x10, x0, mi
+; CHECK-NEXT: and x8, x10, #0xfffffffffffff000
+; CHECK-NEXT: lsr x9, x10, #53
+; CHECK-NEXT: subs x9, x9, #0
+; CHECK-NEXT: csel x8, x8, x10, ne
+; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x9, x0, #0x8000000000000000
+; CHECK-NEXT: orr x8, x8, x9
+; CHECK-NEXT: cset w9, ne
+; CHECK-NEXT: ands x10, x10, #0xfff
+; CHECK-NEXT: csel w9, wzr, w9, eq
+; CHECK-NEXT: mov w9, w9
+; CHECK-NEXT: // kill: def $x9 killed $w9
+; CHECK-NEXT: orr x8, x8, x9
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = sitofp i64 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test uitofp
+define bfloat @uitofp_bf_i1(i1 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_bf_i1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0x1
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = uitofp i1 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test uitofp
+define bfloat @uitofp_bf_i8(i8 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_bf_i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = uitofp i8 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test uitofp
+define bfloat @uitofp_bf_i16(i16 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_bf_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0xffff
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = uitofp i16 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test uitofp
+define bfloat @uitofp_bf_i32(i32 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_bf_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf d0, w0
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = uitofp i32 %a to bfloat
+ ret bfloat %conv
+}
+
+; Test uitofp
+define bfloat @uitofp_bf_i64(i64 %a) nounwind ssp {
+; CHECK-LABEL: uitofp_bf_i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0xfffffffffffff000
+; CHECK-NEXT: lsr x9, x0, #53
+; CHECK-NEXT: subs x9, x9, #0
+; CHECK-NEXT: csel x8, x8, x0, ne
+; CHECK-NEXT: ucvtf d0, x8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: cset w9, ne
+; CHECK-NEXT: ands x10, x0, #0xfff
+; CHECK-NEXT: csel w9, wzr, w9, eq
+; CHECK-NEXT: mov w9, w9
+; CHECK-NEXT: // kill: def $x9 killed $w9
+; CHECK-NEXT: orr x8, x8, x9
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w8, w9, #16, #1
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #32767 // =0x7fff
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = uitofp i64 %a to bfloat
+ ret bfloat %conv
+}
+
diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
index d7bdf2d264c4e5..049098ab2ae97d 100644
--- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
@@ -73,6 +73,56 @@ entry:
ret half %conv1
}
+define bfloat @t7(bfloat %x) {
+; CHECK-LABEL: t7:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: fcvtzs w9, s0
+; CHECK-NEXT: scvtf d0, w9
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi bfloat %x to i32
+ %conv1 = sitofp i32 %conv to bfloat
+ ret bfloat %conv1
+}
+
+define bfloat @t8(bfloat %x) {
+; CHECK-LABEL: t8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: fcvtzu w9, s0
+; CHECK-NEXT: ucvtf d0, w9
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui bfloat %x to i32
+ %conv1 = uitofp i32 %conv to bfloat
+ ret bfloat %conv1
+}
+
define double @t1_strict(double %x) #0 {
; CHECK-LABEL: t1_strict:
; CHECK: // %bb.0: // %entry
@@ -145,14 +195,68 @@ entry:
ret half %conv1
}
+define bfloat @t7_strict(bfloat %x) #0 {
+; CHECK-LABEL: t7_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: fcvtzs w9, s0
+; CHECK-NEXT: scvtf d0, w9
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = call i32 @llvm.experimental.constrained.fptosi.i32.bf16(bfloat %x, metadata !"fpexcept.strict") #0
+ %conv1 = call bfloat @llvm.experimental.constrained.sitofp.i32.bf16(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+ ret bfloat %conv1
+}
+
+define bfloat @t8_strict(bfloat %x) #0 {
+; CHECK-LABEL: t8_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: fcvtzu w9, s0
+; CHECK-NEXT: ucvtf d0, w9
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %conv = call i32 @llvm.experimental.constrained.fptoui.i32.bf16(bfloat %x, metadata !"fpexcept.strict") #0
+ %conv1 = call bfloat @llvm.experimental.constrained.uitofp.i32.bf16(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+ ret bfloat %conv1
+}
+
attributes #0 = { strictfp }
+declare i32 @llvm.experimental.constrained.fptosi.i32.bf16(bfloat, metadata)
+declare i32 @llvm.experimental.constrained.fptoui.i32.bf16(bfloat, metadata)
declare i32 @llvm.experimental.constrained.fptosi.i32.f16(half, metadata)
declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata)
declare i32 @llvm.experimental.constrained.fptosi.i32.f32(float, metadata)
declare i32 @llvm.experimental.constrained.fptoui.i32.f32(float, metadata)
declare i64 @llvm.experimental.constrained.fptosi.i64.f64(double, metadata)
declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata)
+declare bfloat @llvm.experimental.constrained.sitofp.i32.bf16(i32, metadata, metadata)
+declare bfloat @llvm.experimental.constrained.uitofp.i32.bf16(i32, metadata, metadata)
declare half @llvm.experimental.constrained.sitofp.i32.f16(i32, metadata, metadata)
declare half @llvm.experimental.constrained.uitofp.i32.f16(i32, metadata, metadata)
declare float @llvm.experimental.constrained.sitofp.i32.f32(i32, metadata, metadata)
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index be57e1e26a9263..708bb43887f866 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -6206,8 +6206,23 @@ entry:
define bfloat @stofp_i64_bf16(i64 %a) {
; CHECK-LABEL: stofp_i64_bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf s0, x0
+; CHECK-NEXT: cmp x0, #0
+; CHECK-NEXT: and x11, x0, #0x8000000000000000
; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: cneg x9, x0, mi
+; CHECK-NEXT: lsr x10, x9, #53
+; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: and x10, x9, #0xfffffffffffff000
+; CHECK-NEXT: csel x10, x10, x9, ne
+; CHECK-NEXT: scvtf d0, x10
+; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: tst x9, #0xfff
+; CHECK-NEXT: csel w10, wzr, w10, eq
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: orr x9, x9, x11
+; CHECK-NEXT: orr x9, x9, x10
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: fcvtxn s0, d0
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: ubfx w10, w9, #16, #1
; CHECK-NEXT: add w8, w9, w8
@@ -6224,8 +6239,19 @@ entry:
define bfloat @utofp_i64_bf16(i64 %a) {
; CHECK-LABEL: utofp_i64_bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf s0, x0
+; CHECK-NEXT: lsr x9, x0, #53
; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: cmp x9, #0
+; CHECK-NEXT: and x9, x0, #0xfffffffffffff000
+; CHECK-NEXT: csel x9, x9, x0, ne
+; CHECK-NEXT: ucvtf d0, x9
+; CHECK-NEXT: cset w9, ne
+; CHECK-NEXT: tst x0, #0xfff
+; CHECK-NEXT: csel w9, wzr, w9, eq
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: orr x9, x10, x9
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: fcvtxn s0, d0
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: ubfx w10, w9, #16, #1
; CHECK-NEXT: add w8, w9, w8
@@ -6242,8 +6268,9 @@ entry:
define bfloat @stofp_i32_bf16(i32 %a) {
; CHECK-LABEL: stofp_i32_bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf s0, w0
+; CHECK-NEXT: scvtf d0, w0
; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: fcvtxn s0, d0
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: ubfx w10, w9, #16, #1
; CHECK-NEXT: add w8, w9, w8
@@ -6260,8 +6287,9 @@ entry:
define bfloat @utofp_i32_bf16(i32 %a) {
; CHECK-LABEL: utofp_i32_bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf s0, w0
+; CHECK-NEXT: ucvtf d0, w0
; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: fcvtxn s0, d0
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: ubfx w10, w9, #16, #1
; CHECK-NEXT: add w8, w9, w8
@@ -6355,22 +6383,52 @@ define <2 x bfloat> @stofp_v2i64_v2bf16(<2 x i64> %a) {
; CHECK-LABEL: stofp_v2i64_v2bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: fmov x10, d0
; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: scvtf s1, x10
-; CHECK-NEXT: scvtf s0, x9
-; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: cmp x9, #0
+; CHECK-NEXT: cneg x10, x9, mi
+; CHECK-NEXT: and x9, x9, #0x8000000000000000
+; CHECK-NEXT: lsr x11, x10, #53
+; CHECK-NEXT: and x12, x10, #0xfffffffffffff000
+; CHECK-NEXT: cmp x11, #0
+; CHECK-NEXT: csel x11, x12, x10, ne
+; CHECK-NEXT: cset w12, ne
+; CHECK-NEXT: tst x10, #0xfff
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: csel w12, wzr, w12, eq
+; CHECK-NEXT: scvtf d0, x11
+; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: cneg x13, x10, mi
+; CHECK-NEXT: and x10, x10, #0x8000000000000000
+; CHECK-NEXT: lsr x14, x13, #53
+; CHECK-NEXT: cmp x14, #0
+; CHECK-NEXT: and x14, x13, #0xfffffffffffff000
+; CHECK-NEXT: csel x11, x14, x13, ne
+; CHECK-NEXT: cset w14, ne
+; CHECK-NEXT: tst x13, #0xfff
+; CHECK-NEXT: scvtf d1, x11
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: orr x9, x11, x9
+; CHECK-NEXT: csel w11, wzr, w14, eq
+; CHECK-NEXT: fmov x13, d1
+; CHECK-NEXT: orr x9, x9, x12
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: orr x10, x13, x10
+; CHECK-NEXT: orr x10, x10, x11
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov d1, x10
+; CHECK-NEXT: fcvtxn s1, d1
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w12, w10, #16, #1
; CHECK-NEXT: ubfx w11, w9, #16, #1
; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: add w9, w11, w9
+; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: ubfx w12, w10, #16, #1
; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: add w8, w12, w8
-; CHECK-NEXT: add w9, w11, w9
; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: lsr w9, w9, #16
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: mov v0.h[1], v1.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -6383,22 +6441,44 @@ define <2 x bfloat> @utofp_v2i64_v2bf16(<2 x i64> %a) {
; CHECK-LABEL: utofp_v2i64_v2bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: ucvtf s1, x10
-; CHECK-NEXT: ucvtf s0, x9
-; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: lsr x10, x9, #53
+; CHECK-NEXT: and x12, x9, #0xfffffffffffff000
+; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: lsr x10, x11, #53
+; CHECK-NEXT: csel x12, x12, x9, ne
+; CHECK-NEXT: cset w13, ne
+; CHECK-NEXT: tst x9, #0xfff
+; CHECK-NEXT: csel w9, wzr, w13, eq
+; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: and x10, x11, #0xfffffffffffff000
+; CHECK-NEXT: csel x10, x10, x11, ne
+; CHECK-NEXT: ucvtf d0, x12
+; CHECK-NEXT: ucvtf d1, x10
+; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: tst x11, #0xfff
+; CHECK-NEXT: csel w10, wzr, w10, eq
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: fmov x12, d1
+; CHECK-NEXT: orr x9, x11, x9
+; CHECK-NEXT: orr x10, x12, x10
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: fmov d1, x10
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fcvtxn s1, d1
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w12, w10, #16, #1
+; CHECK-NEXT: fmov w10, s1
; CHECK-NEXT: ubfx w11, w9, #16, #1
; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: ubfx w12, w10, #16, #1
; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: add w8, w12, w8
; CHECK-NEXT: add w9, w11, w9
-; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: add w8, w12, w8
; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: lsr w8, w8, #16
; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov v0.h[1], v1.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list