[llvm] 6c773a8 - [LLVM][SVE] Implement isel for bfloat fptoi and itofp operations. (#129713)
llvm-commits at lists.llvm.org
Wed Mar 19 04:51:45 PDT 2025
Author: Paul Walker
Date: 2025-03-19T11:51:42Z
New Revision: 6c773a87013cf82b68ba9be4672e3908a9ab6010
URL: https://github.com/llvm/llvm-project/commit/6c773a87013cf82b68ba9be4672e3908a9ab6010
DIFF: https://github.com/llvm/llvm-project/commit/6c773a87013cf82b68ba9be4672e3908a9ab6010.diff
LOG: [LLVM][SVE] Implement isel for bfloat fptoi and itofp operations. (#129713)
NOTE: This PR only considers scalable vectors because SVE VLS does not
support bfloat (see useSVEForFixedLengthVectorVT()).
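For illustration, a scalable bfloat-to-integer conversion such as

define <vscale x 4 x i32> @fptosi_nxv4bf16_to_nxv4i32(<vscale x 4 x bfloat> %a) {
  %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

is now selected by widening each bfloat element to f32 with a 16-bit left
shift of the raw bits and converting from there, as the checks in the new
test file below show:

  lsl z0.s, z0.s, #16
  ptrue p0.s
  fcvtzs z0.s, p0/m, z0.s
  ret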
Added:
llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/SVEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 879b83f94b79a..0db6c614684d7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4600,6 +4600,10 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
if (VT.isScalableVector()) {
+ // Let common code split the operation.
+ if (SrcVT == MVT::nxv8f32)
+ return Op;
+
if (VT.getScalarType() != MVT::bf16)
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
@@ -4742,6 +4746,22 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
assert(!(IsStrict && VT.isScalableVector()) &&
"Unimplemented SVE support for STRICT_FP_to_INT!");
+ // f16 conversions are promoted to f32 when full fp16 is not supported.
+ if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
+ InVT.getVectorElementType() == MVT::bf16) {
+ EVT NewVT = VT.changeElementType(MVT::f32);
+ SDLoc dl(Op);
+ if (IsStrict) {
+ SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
+ {Op.getOperand(0), Op.getOperand(1)});
+ return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
+ {Ext.getValue(1), Ext.getValue(0)});
+ }
+ return DAG.getNode(
+ Op.getOpcode(), dl, Op.getValueType(),
+ DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
+ }
+
if (VT.isScalableVector()) {
if (VT.getVectorElementType() == MVT::i1) {
SDLoc DL(Op);
@@ -4751,6 +4771,10 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
}
+ // Let common code split the operation.
+ if (InVT == MVT::nxv8f32)
+ return Op;
+
unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
? AArch64ISD::FCVTZU_MERGE_PASSTHRU
: AArch64ISD::FCVTZS_MERGE_PASSTHRU;
@@ -4761,24 +4785,6 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
return LowerFixedLengthFPToIntToSVE(Op, DAG);
- unsigned NumElts = InVT.getVectorNumElements();
-
- // f16 conversions are promoted to f32 when full fp16 is not supported.
- if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
- InVT.getVectorElementType() == MVT::bf16) {
- MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
- SDLoc dl(Op);
- if (IsStrict) {
- SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
- {Op.getOperand(0), Op.getOperand(1)});
- return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
- {Ext.getValue(1), Ext.getValue(0)});
- }
- return DAG.getNode(
- Op.getOpcode(), dl, Op.getValueType(),
- DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
- }
-
uint64_t VTSize = VT.getFixedSizeInBits();
uint64_t InVTSize = InVT.getFixedSizeInBits();
if (VTSize < InVTSize) {
@@ -4813,7 +4819,7 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
// Use a scalar operation for conversions between single-element vectors of
// the same size.
- if (NumElts == 1) {
+ if (InVT.getVectorNumElements() == 1) {
SDLoc dl(Op);
SDValue Extract = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
@@ -5059,23 +5065,14 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
assert(!(IsStrict && VT.isScalableVector()) &&
"Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
- if (VT.isScalableVector()) {
- if (InVT.getVectorElementType() == MVT::i1) {
- SDValue FalseVal = DAG.getConstantFP(0.0, dl, VT);
- SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, dl, VT)
- : DAG.getConstantFP(1.0, dl, VT);
- return DAG.getNode(ISD::VSELECT, dl, VT, In, TrueVal, FalseVal);
- }
-
- unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
- : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
- return LowerToPredicatedOp(Op, DAG, Opcode);
+ // NOTE: i1->bf16 does not require promotion to f32.
+ if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
+ SDValue FalseVal = DAG.getConstantFP(0.0, dl, VT);
+ SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, dl, VT)
+ : DAG.getConstantFP(1.0, dl, VT);
+ return DAG.getNode(ISD::VSELECT, dl, VT, In, TrueVal, FalseVal);
}
- if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
- useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
- return LowerFixedLengthIntToFPToSVE(Op, DAG);
-
// Promote bf16 conversions to f32.
if (VT.getVectorElementType() == MVT::bf16) {
EVT F32 = VT.changeElementType(MVT::f32);
@@ -5092,6 +5089,20 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
}
+ if (VT.isScalableVector()) {
+ // Let common code split the operation.
+ if (VT == MVT::nxv8f32)
+ return Op;
+
+ unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
+ : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
+ return LowerToPredicatedOp(Op, DAG, Opcode);
+ }
+
+ if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
+ useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
+ return LowerFixedLengthIntToFPToSVE(Op, DAG);
+
uint64_t VTSize = VT.getFixedSizeInBits();
uint64_t InVTSize = InVT.getFixedSizeInBits();
if (VTSize < InVTSize) {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 8d2e7f4a8ed10..eafaf1717902e 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5465,6 +5465,14 @@ multiclass sve_int_dup_fpimm_pred<string asm> {
(!cast<Instruction>(NAME # _S) $zd, $pg, fpimm32:$imm8)>;
def : Pat<(nxv2f64 (vselect nxv2i1:$pg, (splat_vector fpimm64:$imm8), nxv2f64:$zd)),
(!cast<Instruction>(NAME # _D) $zd, $pg, fpimm64:$imm8)>;
+
+ // Some half precision immediates alias with bfloat (e.g. f16(1.875) == bf16(1.0)).
+ def : Pat<(nxv8bf16 (vselect nxv8i1:$pg, (splat_vector fpimmbf16:$imm8), nxv8bf16:$zd)),
+ (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
+ def : Pat<(nxv4bf16 (vselect nxv4i1:$pg, (splat_vector fpimmbf16:$imm8), nxv4bf16:$zd)),
+ (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
+ def : Pat<(nxv2bf16 (vselect nxv2i1:$pg, (splat_vector fpimmbf16:$imm8), nxv2bf16:$zd)),
+ (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
}
class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll
new file mode 100644
index 0000000000000..d6484c2483f49
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll
@@ -0,0 +1,816 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 2 x i1> @fptosi_nxv2bf16_to_nxv2i1(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i1>
+ ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 2 x i8> @fptosi_nxv2bf16_to_nxv2i8(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i8>
+ ret <vscale x 2 x i8> %res
+}
+
+define <vscale x 2 x i16> @fptosi_nxv2bf16_to_nxv2i16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %res
+}
+
+define <vscale x 2 x i32> @fptosi_nxv2bf16_to_nxv2i32(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 2 x i64> @fptosi_nxv2bf16_to_nxv2i64(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i1> @fptosi_nxv4bf16_to_nxv4i1(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i1>
+ ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i8> @fptosi_nxv4bf16_to_nxv4i8(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i8>
+ ret <vscale x 4 x i8> %res
+}
+
+define <vscale x 4 x i16> @fptosi_nxv4bf16_to_nxv4i16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %res
+}
+
+define <vscale x 4 x i32> @fptosi_nxv4bf16_to_nxv4i32(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i64> @fptosi_nxv4bf16_to_nxv4i64(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z2.s, z0.s, #16
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s
+; CHECK-NEXT: movprfx z1, z2
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i64>
+ ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 8 x i1> @fptosi_nxv8bf16_to_nxv8i1(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i1>
+ ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i8> @fptosi_nxv8bf16_to_nxv8i8(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i8>
+ ret <vscale x 8 x i8> %res
+}
+
+define <vscale x 8 x i16> @fptosi_nxv8bf16_to_nxv8i16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i32> @fptosi_nxv8bf16_to_nxv8i32(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z2.s, z0.s, #16
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.s
+; CHECK-NEXT: movprfx z1, z2
+; CHECK-NEXT: fcvtzs z1.s, p0/m, z2.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i32>
+ ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x i64> @fptosi_nxv8bf16_to_nxv8i64(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z2.d, z1.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z2.s, z2.s, #16
+; CHECK-NEXT: lsl z3.s, z3.s, #16
+; CHECK-NEXT: lsl z4.s, z0.s, #16
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.s
+; CHECK-NEXT: movprfx z0, z2
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z2.s
+; CHECK-NEXT: movprfx z2, z3
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z3.s
+; CHECK-NEXT: movprfx z3, z4
+; CHECK-NEXT: fcvtzs z3.d, p0/m, z4.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i64>
+ ret <vscale x 8 x i64> %res
+}
+
+define <vscale x 2 x i1> @fptoui_nxv2bf16_to_nxv2i1(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i1>
+ ret <vscale x 2 x i1> %res
+}
+
+; NOTE: Using fcvtzs is safe as fptoui overflow is considered poison and a
+; 64-bit signed value encompasses the entire range of a 16-bit unsigned value.
+define <vscale x 2 x i8> @fptoui_nxv2bf16_to_nxv2i8(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i8>
+ ret <vscale x 2 x i8> %res
+}
+
+define <vscale x 2 x i16> @fptoui_nxv2bf16_to_nxv2i16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %res
+}
+
+define <vscale x 2 x i32> @fptoui_nxv2bf16_to_nxv2i32(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 2 x i64> @fptoui_nxv2bf16_to_nxv2i64(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i1> @fptoui_nxv4bf16_to_nxv4i1(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i1>
+ ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i8> @fptoui_nxv4bf16_to_nxv4i8(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i8>
+ ret <vscale x 4 x i8> %res
+}
+
+define <vscale x 4 x i16> @fptoui_nxv4bf16_to_nxv4i16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %res
+}
+
+define <vscale x 4 x i32> @fptoui_nxv4bf16_to_nxv4i32(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i64> @fptoui_nxv4bf16_to_nxv4i64(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z2.s, z0.s, #16
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s
+; CHECK-NEXT: movprfx z1, z2
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z2.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i64>
+ ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 8 x i1> @fptoui_nxv8bf16_to_nxv8i1(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 8 x bfloat> %a to <vscale x 8 x i1>
+ ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i8> @fptoui_nxv8bf16_to_nxv8i8(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 8 x bfloat> %a to <vscale x 8 x i8>
+ ret <vscale x 8 x i8> %res
+}
+
+define <vscale x 8 x i16> @fptoui_nxv8bf16_to_nxv8i16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 8 x bfloat> %a to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i32> @fptoui_nxv8bf16_to_nxv8i32(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z2.s, z0.s, #16
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.s
+; CHECK-NEXT: movprfx z1, z2
+; CHECK-NEXT: fcvtzu z1.s, p0/m, z2.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 8 x bfloat> %a to <vscale x 8 x i32>
+ ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x i64> @fptoui_nxv8bf16_to_nxv8i64(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z2.d, z1.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z2.s, z2.s, #16
+; CHECK-NEXT: lsl z3.s, z3.s, #16
+; CHECK-NEXT: lsl z4.s, z0.s, #16
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.s
+; CHECK-NEXT: movprfx z0, z2
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z2.s
+; CHECK-NEXT: movprfx z2, z3
+; CHECK-NEXT: fcvtzu z2.d, p0/m, z3.s
+; CHECK-NEXT: movprfx z3, z4
+; CHECK-NEXT: fcvtzu z3.d, p0/m, z4.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 8 x bfloat> %a to <vscale x 8 x i64>
+ ret <vscale x 8 x i64> %res
+}
+
+; NOTE: f16(-1.875) == bf16(-1.0)
+define <vscale x 2 x bfloat> @sitofp_nxv2i1_to_nxv2bf16(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: sitofp_nxv2i1_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #-1.87500000
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @sitofp_nxv2i8_to_nxv2bf16(<vscale x 2 x i8> %a) {
+; CHECK-LABEL: sitofp_nxv2i8_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 2 x i8> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @sitofp_nxv2i16_to_nxv2bf16(<vscale x 2 x i16> %a) {
+; CHECK-LABEL: sitofp_nxv2i16_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 2 x i16> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @sitofp_nxv2i32_to_nxv2bf16(<vscale x 2 x i32> %a) {
+; CHECK-LABEL: sitofp_nxv2i32_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 2 x i32> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @sitofp_nxv2i64_to_nxv2bf16(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: sitofp_nxv2i64_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 2 x i64> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @sitofp_nxv4i1_to_nxv4bf16(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: sitofp_nxv4i1_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #-1.87500000
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @sitofp_nxv4i8_to_nxv4bf16(<vscale x 4 x i8> %a) {
+; CHECK-LABEL: sitofp_nxv4i8_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 4 x i8> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @sitofp_nxv4i16_to_nxv4bf16(<vscale x 4 x i16> %a) {
+; CHECK-LABEL: sitofp_nxv4i16_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 4 x i16> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @sitofp_nxv4i32_to_nxv4bf16(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sitofp_nxv4i32_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 4 x i32> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @sitofp_nxv4i64_to_nxv4bf16(<vscale x 4 x i64> %a) {
+; CHECK-LABEL: sitofp_nxv4i64_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z1.s, p0/m, z1.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 4 x i64> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @sitofp_nxv8i1_to_nxv8bf16(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: sitofp_nxv8i1_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #-1.87500000
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 8 x i1> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @sitofp_nxv8i8_to_nxv8bf16(<vscale x 8 x i8> %a) {
+; CHECK-LABEL: sitofp_nxv8i8_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: sxtb z0.h, p0/m, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sunpkhi z1.s, z0.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 8 x i8> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @sitofp_nxv8i16_to_nxv8bf16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sitofp_nxv8i16_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpkhi z1.s, z0.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 8 x i16> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @sitofp_nxv8i32_to_nxv8bf16(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: sitofp_nxv8i32_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 8 x i32> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @sitofp_nxv8i64_to_nxv8bf16(<vscale x 8 x i64> %a) {
+; CHECK-LABEL: sitofp_nxv8i64_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: scvtf z3.s, p0/m, z3.d
+; CHECK-NEXT: scvtf z2.s, p0/m, z2.d
+; CHECK-NEXT: scvtf z1.s, p0/m, z1.d
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z3.h, p0/m, z3.s
+; CHECK-NEXT: bfcvt z2.h, p0/m, z2.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+ %res = sitofp <vscale x 8 x i64> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+; NOTE: f16(1.875) == bf16(1.0)
+define <vscale x 2 x bfloat> @uitofp_nxv2i1_to_nxv2bf16(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: uitofp_nxv2i1_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #1.87500000
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @uitofp_nxv2i8_to_nxv2bf16(<vscale x 2 x i8> %a) {
+; CHECK-LABEL: uitofp_nxv2i8_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 2 x i8> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @uitofp_nxv2i16_to_nxv2bf16(<vscale x 2 x i16> %a) {
+; CHECK-LABEL: uitofp_nxv2i16_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffff
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 2 x i16> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @uitofp_nxv2i32_to_nxv2bf16(<vscale x 2 x i32> %a) {
+; CHECK-LABEL: uitofp_nxv2i32_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 2 x i32> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @uitofp_nxv2i64_to_nxv2bf16(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uitofp_nxv2i64_to_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 2 x i64> %a to <vscale x 2 x bfloat>
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @uitofp_nxv4i1_to_nxv4bf16(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: uitofp_nxv4i1_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #1.87500000
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @uitofp_nxv4i8_to_nxv4bf16(<vscale x 4 x i8> %a) {
+; CHECK-LABEL: uitofp_nxv4i8_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 4 x i8> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @uitofp_nxv4i16_to_nxv4bf16(<vscale x 4 x i16> %a) {
+; CHECK-LABEL: uitofp_nxv4i16_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 4 x i16> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @uitofp_nxv4i32_to_nxv4bf16(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uitofp_nxv4i32_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 4 x i32> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @uitofp_nxv4i64_to_nxv4bf16(<vscale x 4 x i64> %a) {
+; CHECK-LABEL: uitofp_nxv4i64_to_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 4 x i64> %a to <vscale x 4 x bfloat>
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @uitofp_nxv8i1_to_nxv8bf16(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: uitofp_nxv8i1_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #1.87500000
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 8 x i1> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @uitofp_nxv8i8_to_nxv8bf16(<vscale x 8 x i8> %a) {
+; CHECK-LABEL: uitofp_nxv8i8_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.h, z0.h, #0xff
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 8 x i8> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @uitofp_nxv8i16_to_nxv8bf16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: uitofp_nxv8i16_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 8 x i16> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @uitofp_nxv8i32_to_nxv8bf16(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: uitofp_nxv8i32_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 8 x i32> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @uitofp_nxv8i64_to_nxv8bf16(<vscale x 8 x i64> %a) {
+; CHECK-LABEL: uitofp_nxv8i64_to_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z3.s, p0/m, z3.d
+; CHECK-NEXT: ucvtf z2.s, p0/m, z2.d
+; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: bfcvt z3.h, p0/m, z3.s
+; CHECK-NEXT: bfcvt z2.h, p0/m, z2.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+ %res = uitofp <vscale x 8 x i64> %a to <vscale x 8 x bfloat>
+ ret <vscale x 8 x bfloat> %res
+}
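The bf16 tests above rest on two bit-level facts worth spelling out: extending
bfloat to single precision is just a 16-bit left shift of the raw bits (the
"lsl z0.s, z0.s, #16" idiom in the CHECK lines), and the bfloat 1.0 bit
pattern (0x3F80) decodes as 1.875 when read as IEEE half precision, which is
why an FMOV/FDUP half-precision immediate of 1.875 can materialise a bfloat
1.0 splat, as the new sve_int_dup_fpimm_pred patterns exploit. A minimal
standalone sketch of both facts (the helper names here are illustrative, not
part of the patch):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen a bfloat16 bit pattern to float: bf16 is the top half of an f32,
// so a 16-bit left shift of the raw bits is the whole conversion.
float bf16_to_f32(uint16_t bits) {
  uint32_t w = static_cast<uint32_t>(bits) << 16;
  float f;
  std::memcpy(&f, &w, sizeof f);
  return f;
}

// Decode the same 16 bits as an IEEE fp16 value (normal numbers only,
// which is enough for this demonstration).
float f16_to_f32(uint16_t bits) {
  int sign = (bits >> 15) & 0x1;
  int exp = (bits >> 10) & 0x1f;  // 5-bit exponent, bias 15
  int man = bits & 0x3ff;         // 10-bit mantissa
  float v = std::ldexp(1.0f + man / 1024.0f, exp - 15);
  return sign ? -v : v;
}

int main() {
  printf("0x3F80 as bf16: %g\n", bf16_to_f32(0x3F80)); // prints 1
  printf("0x3F80 as f16:  %g\n", f16_to_f32(0x3F80));  // prints 1.875
}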