[llvm] [AArch64] Fix truncating stores to bf16 when NEON and SVE are unavailable (PR #94437)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 5 00:55:54 PDT 2024
https://github.com/sdesmalen-arm created https://github.com/llvm/llvm-project/pull/94437
There are a few changes in this patch:
* Avoid combining a truncate to bfloat16 + store into a truncating store, when +bf16 is not available.
* Handle lowering of truncating store of v2f32->v2bf16 and v4f32->v4bf16, since those can be handled using SVE. In theory, more types can be handled but this needs more work in addTypeForFixedLengthSVE.
The types that are not handled are scalarised into scalar truncating stores.
* Handle lowering of a scalar truncating store to bf16, into a FP_ROUND + bitcast + store.
* Simplification of the condition in LowerFP_ROUND() as it was unnecessarily convoluted. This is NFC, but I've added an assert to ensure we don't emit a NEON vector FCVTXN when in Streaming-SVE mode.
>From fe046ad86f92663a81f5e151b60512d0a1b9ca8f Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Tue, 30 Apr 2024 07:57:10 +0100
Subject: [PATCH] [AArch64] Fix truncating stores to bf16 when NEON and SVE are
unavailable.
There are a few changes in this patch:
* Avoid combining a truncate to bfloat16 + store
into a truncating store, when +bf16 is not available.
* Handle lowering of truncating store of v2f32->v2bf16 and v4f32->v4bf16,
since those can be handled using SVE. In theory, more types can be
handled but this needs more work in addTypeForFixedLengthSVE.
The types that are not handled are scalarised into scalar truncating stores.
* Handle lowering of a scalar truncating store to bf16, into a FP_ROUND +
bitcast + store.
* Simplification of the condition in LowerFP_ROUND() as it was unnecessarily
convoluted. This is NFC, but I've added an assert to ensure we don't emit
a NEON vector FCVTXN when in Streaming-SVE mode.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 35 +-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 4 +-
llvm/lib/Target/AArch64/SVEInstrFormats.td | 7 +-
...reaming-mode-fixed-length-fp-trunc-bf16.ll | 375 ++++++++++++++++++
4 files changed, 412 insertions(+), 9 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-trunc-bf16.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ac6f1e07c4184..8df6f0b21a0ec 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1664,6 +1664,22 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addTypeForFixedLengthSVE(VT);
}
+ // If a truncating store gets expanded we need to lower it correctly.
+ if (Subtarget->hasBF16()) {
+ setTruncStoreAction(MVT::f32, MVT::bf16, Custom);
+ setTruncStoreAction(MVT::f64, MVT::bf16, Custom);
+ }
+
+ // FIXME: useSVEForFixedLengthVectorVT currently doesn't return for 'bf16'
+ // elements and if it would, then the code in addTypeForFixedLengthSVE
+ // isn't able to handle it. To support wider types than just NEON-sized
+ // vectors, more work is needed.
+ if (Subtarget->hasBF16() && !Subtarget->isNeonAvailable()) {
+ // We can use SVE BFCVT for these operations.
+ setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Custom);
+ setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Custom);
+ }
+
// 64bit results can mean a bigger than NEON input.
for (auto VT : {MVT::v8i8, MVT::v4i16})
setOperationAction(ISD::TRUNCATE, VT, Custom);
@@ -4102,9 +4118,7 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
// Expand cases where the result type is BF16 but we don't have hardware
// instructions to lower it.
- if (VT.getScalarType() == MVT::bf16 &&
- !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
- Subtarget->hasBF16())) {
+ if (VT.getScalarType() == MVT::bf16 && !Subtarget->hasBF16()) {
SDLoc dl(Op);
SDValue Narrow = SrcVal;
SDValue NaN;
@@ -4118,7 +4132,9 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
DAG.getConstant(0x400000, dl, I32));
}
- } else if (SrcVT.getScalarType() == MVT::f64) {
+ } else if (SrcVT.getScalarType() == MVT::f64 && Subtarget->hasNEON()) {
+ assert((!VT.isVector() || Subtarget->isNeonAvailable()) &&
+ "FP_ROUND should have been scalarised");
Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
} else {
@@ -6286,6 +6302,14 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
}
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
return LowerStore128(Op, DAG);
+ } else if (StoreNode->isTruncatingStore() && MemVT == MVT::bf16) {
+ assert(Subtarget->hasBF16() && "Should have chosen Expand for bf16");
+ SDValue FPRound = DAG.getNode(ISD::FP_ROUND, Dl, MVT::bf16, Value,
+ DAG.getIntPtrConstant(0, Dl));
+ SDValue FPRoundBC = DAG.getNode(ISD::BITCAST, Dl, MVT::f16, FPRound);
+ return DAG.getStore(StoreNode->getChain(), Dl, FPRoundBC,
+ StoreNode->getBasePtr(), StoreNode->getPointerInfo(),
+ StoreNode->getOriginalAlign());
} else if (MemVT == MVT::i64x8) {
SDValue Value = StoreNode->getValue();
assert(Value->getValueType(0) == MVT::i64x8);
@@ -22332,7 +22356,8 @@ static SDValue performSTORECombine(SDNode *N,
Subtarget->useSVEForFixedLengthVectors() &&
ValueVT.isFixedLengthVector() &&
ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
- hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
+ hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()) &&
+ (ST->getMemoryVT().getScalarType() != MVT::bf16 || Subtarget->hasBF16()))
return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
ST->getMemoryVT(), ST->getMemOperand());
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bd5de628d8529..bf6aaa52ac50c 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2379,8 +2379,8 @@ let Predicates = [HasBF16, HasSVEorSME] in {
defm BFMLALT_ZZZ : sve2_fp_mla_long<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt>;
defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
- defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
- defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
+ defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32, AArch64fcvtr_mt>;
+ defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32, null_frag>;
} // End HasBF16, HasSVEorSME
let Predicates = [HasSVEorSME] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index fc7d3cdda4acd..0b9fa83e6daf0 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -8785,9 +8785,12 @@ class sve_bfloat_convert<bit N, string asm>
let mayRaiseFPException = 1;
}
-multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
+multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator int_op, SDPatternOperator ir_op> {
def NAME : sve_bfloat_convert<N, asm>;
- def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_Pat<nxv8bf16, int_op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
+
+ def : SVE_1_Op_Passthru_Round_Pat<nxv2bf16, ir_op, nxv2i1, nxv2f32, !cast<Instruction>(NAME)>;
+ def : SVE_1_Op_Passthru_Round_Pat<nxv4bf16, ir_op, nxv4i1, nxv4f32, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-trunc-bf16.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-trunc-bf16.ll
new file mode 100644
index 0000000000000..8665b74bb07d6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-trunc-bf16.ll
@@ -0,0 +1,375 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mattr=+bf16 < %s | FileCheck %s --check-prefix=BF16-NEON
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NOBF16-NOSVE-NONEON
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefix=NOBF16-SVE-NONEON
+; RUN: llc -mattr=+bf16 -force-streaming-compatible < %s | FileCheck %s --check-prefix=BF16-NOSVE-NONEON
+; RUN: llc -mattr=+sve,+bf16 -force-streaming-compatible < %s | FileCheck %s --check-prefix=BF16-SVE-NONEON
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @fcvt_v2f64_v2bf16(ptr %a, ptr %b) {
+; BF16-NEON-LABEL: fcvt_v2f64_v2bf16:
+; BF16-NEON: // %bb.0:
+; BF16-NEON-NEXT: ldr q0, [x0]
+; BF16-NEON-NEXT: fcvtxn v0.2s, v0.2d
+; BF16-NEON-NEXT: bfcvtn v0.4h, v0.4s
+; BF16-NEON-NEXT: str s0, [x1]
+; BF16-NEON-NEXT: ret
+;
+; NOBF16-NOSVE-NONEON-LABEL: fcvt_v2f64_v2bf16:
+; NOBF16-NOSVE-NONEON: // %bb.0:
+; NOBF16-NOSVE-NONEON-NEXT: ldr q0, [x0]
+; NOBF16-NOSVE-NONEON-NEXT: mov w8, #32767 // =0x7fff
+; NOBF16-NOSVE-NONEON-NEXT: str q0, [sp, #-32]!
+; NOBF16-NOSVE-NONEON-NEXT: .cfi_def_cfa_offset 32
+; NOBF16-NOSVE-NONEON-NEXT: ldr d0, [sp, #8]
+; NOBF16-NOSVE-NONEON-NEXT: fcvtxn s0, d0
+; NOBF16-NOSVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-NOSVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-NOSVE-NONEON-NEXT: add w9, w9, w8
+; NOBF16-NOSVE-NONEON-NEXT: add w9, w10, w9
+; NOBF16-NOSVE-NONEON-NEXT: lsr w9, w9, #16
+; NOBF16-NOSVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-NOSVE-NONEON-NEXT: str h0, [sp, #18]
+; NOBF16-NOSVE-NONEON-NEXT: ldr d0, [sp]
+; NOBF16-NOSVE-NONEON-NEXT: fcvtxn s0, d0
+; NOBF16-NOSVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-NOSVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-NOSVE-NONEON-NEXT: add w8, w9, w8
+; NOBF16-NOSVE-NONEON-NEXT: add w8, w10, w8
+; NOBF16-NOSVE-NONEON-NEXT: lsr w8, w8, #16
+; NOBF16-NOSVE-NONEON-NEXT: fmov s0, w8
+; NOBF16-NOSVE-NONEON-NEXT: str h0, [sp, #16]
+; NOBF16-NOSVE-NONEON-NEXT: ldr d0, [sp, #16]
+; NOBF16-NOSVE-NONEON-NEXT: str d0, [sp, #24]
+; NOBF16-NOSVE-NONEON-NEXT: ldr w8, [sp, #24]
+; NOBF16-NOSVE-NONEON-NEXT: str w8, [x1]
+; NOBF16-NOSVE-NONEON-NEXT: add sp, sp, #32
+; NOBF16-NOSVE-NONEON-NEXT: ret
+;
+; NOBF16-SVE-NONEON-LABEL: fcvt_v2f64_v2bf16:
+; NOBF16-SVE-NONEON: // %bb.0:
+; NOBF16-SVE-NONEON-NEXT: sub sp, sp, #16
+; NOBF16-SVE-NONEON-NEXT: .cfi_def_cfa_offset 16
+; NOBF16-SVE-NONEON-NEXT: ldr q0, [x0]
+; NOBF16-SVE-NONEON-NEXT: mov w8, #32767 // =0x7fff
+; NOBF16-SVE-NONEON-NEXT: mov z1.d, z0.d[1]
+; NOBF16-SVE-NONEON-NEXT: fcvtxn s0, d0
+; NOBF16-SVE-NONEON-NEXT: fcvtxn s1, d1
+; NOBF16-SVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-SVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-SVE-NONEON-NEXT: fmov w11, s1
+; NOBF16-SVE-NONEON-NEXT: add w9, w9, w8
+; NOBF16-SVE-NONEON-NEXT: add w9, w10, w9
+; NOBF16-SVE-NONEON-NEXT: ubfx w12, w11, #16, #1
+; NOBF16-SVE-NONEON-NEXT: lsr w9, w9, #16
+; NOBF16-SVE-NONEON-NEXT: add w8, w11, w8
+; NOBF16-SVE-NONEON-NEXT: add w8, w12, w8
+; NOBF16-SVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-SVE-NONEON-NEXT: lsr w8, w8, #16
+; NOBF16-SVE-NONEON-NEXT: str h0, [sp, #8]
+; NOBF16-SVE-NONEON-NEXT: fmov s0, w8
+; NOBF16-SVE-NONEON-NEXT: str h0, [sp, #10]
+; NOBF16-SVE-NONEON-NEXT: ldr d0, [sp, #8]
+; NOBF16-SVE-NONEON-NEXT: fmov w8, s0
+; NOBF16-SVE-NONEON-NEXT: str w8, [x1]
+; NOBF16-SVE-NONEON-NEXT: add sp, sp, #16
+; NOBF16-SVE-NONEON-NEXT: ret
+;
+; BF16-NOSVE-NONEON-LABEL: fcvt_v2f64_v2bf16:
+; BF16-NOSVE-NONEON: // %bb.0:
+; BF16-NOSVE-NONEON-NEXT: ldr q0, [x0]
+; BF16-NOSVE-NONEON-NEXT: str q0, [sp, #-32]!
+; BF16-NOSVE-NONEON-NEXT: .cfi_def_cfa_offset 32
+; BF16-NOSVE-NONEON-NEXT: ldr d0, [sp, #8]
+; BF16-NOSVE-NONEON-NEXT: fcvtxn s0, d0
+; BF16-NOSVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-NOSVE-NONEON-NEXT: str h0, [sp, #18]
+; BF16-NOSVE-NONEON-NEXT: ldr d0, [sp]
+; BF16-NOSVE-NONEON-NEXT: fcvtxn s0, d0
+; BF16-NOSVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-NOSVE-NONEON-NEXT: str h0, [sp, #16]
+; BF16-NOSVE-NONEON-NEXT: ldr d0, [sp, #16]
+; BF16-NOSVE-NONEON-NEXT: str d0, [sp, #24]
+; BF16-NOSVE-NONEON-NEXT: ldr w8, [sp, #24]
+; BF16-NOSVE-NONEON-NEXT: str w8, [x1]
+; BF16-NOSVE-NONEON-NEXT: add sp, sp, #32
+; BF16-NOSVE-NONEON-NEXT: ret
+;
+; BF16-SVE-NONEON-LABEL: fcvt_v2f64_v2bf16:
+; BF16-SVE-NONEON: // %bb.0:
+; BF16-SVE-NONEON-NEXT: ldr q0, [x0]
+; BF16-SVE-NONEON-NEXT: mov z1.d, z0.d[1]
+; BF16-SVE-NONEON-NEXT: fcvtxn s0, d0
+; BF16-SVE-NONEON-NEXT: fcvtxn s1, d1
+; BF16-SVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-SVE-NONEON-NEXT: bfcvt h1, s1
+; BF16-SVE-NONEON-NEXT: str h0, [x1]
+; BF16-SVE-NONEON-NEXT: str h1, [x1, #2]
+; BF16-SVE-NONEON-NEXT: ret
+ %op1 = load <2 x double>, ptr %a
+ %res = fptrunc <2 x double> %op1 to <2 x bfloat>
+ store <2 x bfloat> %res, ptr %b
+ ret void
+}
+
+define void @fcvt_v4f34_v4bf16(ptr %a, ptr %b) {
+; BF16-NEON-LABEL: fcvt_v4f34_v4bf16:
+; BF16-NEON: // %bb.0:
+; BF16-NEON-NEXT: ldr q0, [x0]
+; BF16-NEON-NEXT: bfcvtn v0.4h, v0.4s
+; BF16-NEON-NEXT: str d0, [x1]
+; BF16-NEON-NEXT: ret
+;
+; NOBF16-NOSVE-NONEON-LABEL: fcvt_v4f34_v4bf16:
+; NOBF16-NOSVE-NONEON: // %bb.0:
+; NOBF16-NOSVE-NONEON-NEXT: ldr q0, [x0]
+; NOBF16-NOSVE-NONEON-NEXT: mov w8, #32767 // =0x7fff
+; NOBF16-NOSVE-NONEON-NEXT: str q0, [sp, #-32]!
+; NOBF16-NOSVE-NONEON-NEXT: .cfi_def_cfa_offset 32
+; NOBF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #12]
+; NOBF16-NOSVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-NOSVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-NOSVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-NOSVE-NONEON-NEXT: add w11, w9, w8
+; NOBF16-NOSVE-NONEON-NEXT: orr w9, w9, #0x400000
+; NOBF16-NOSVE-NONEON-NEXT: add w10, w10, w11
+; NOBF16-NOSVE-NONEON-NEXT: csel w9, w9, w10, vs
+; NOBF16-NOSVE-NONEON-NEXT: lsr w9, w9, #16
+; NOBF16-NOSVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-NOSVE-NONEON-NEXT: str h0, [sp, #30]
+; NOBF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #8]
+; NOBF16-NOSVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-NOSVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-NOSVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-NOSVE-NONEON-NEXT: add w11, w9, w8
+; NOBF16-NOSVE-NONEON-NEXT: orr w9, w9, #0x400000
+; NOBF16-NOSVE-NONEON-NEXT: add w10, w10, w11
+; NOBF16-NOSVE-NONEON-NEXT: csel w9, w9, w10, vs
+; NOBF16-NOSVE-NONEON-NEXT: lsr w9, w9, #16
+; NOBF16-NOSVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-NOSVE-NONEON-NEXT: str h0, [sp, #28]
+; NOBF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #4]
+; NOBF16-NOSVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-NOSVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-NOSVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-NOSVE-NONEON-NEXT: add w11, w9, w8
+; NOBF16-NOSVE-NONEON-NEXT: orr w9, w9, #0x400000
+; NOBF16-NOSVE-NONEON-NEXT: add w10, w10, w11
+; NOBF16-NOSVE-NONEON-NEXT: csel w9, w9, w10, vs
+; NOBF16-NOSVE-NONEON-NEXT: lsr w9, w9, #16
+; NOBF16-NOSVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-NOSVE-NONEON-NEXT: str h0, [sp, #26]
+; NOBF16-NOSVE-NONEON-NEXT: ldr s0, [sp]
+; NOBF16-NOSVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-NOSVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-NOSVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-NOSVE-NONEON-NEXT: add w8, w9, w8
+; NOBF16-NOSVE-NONEON-NEXT: orr w9, w9, #0x400000
+; NOBF16-NOSVE-NONEON-NEXT: add w8, w10, w8
+; NOBF16-NOSVE-NONEON-NEXT: csel w8, w9, w8, vs
+; NOBF16-NOSVE-NONEON-NEXT: lsr w8, w8, #16
+; NOBF16-NOSVE-NONEON-NEXT: fmov s0, w8
+; NOBF16-NOSVE-NONEON-NEXT: str h0, [sp, #24]
+; NOBF16-NOSVE-NONEON-NEXT: ldr d0, [sp, #24]
+; NOBF16-NOSVE-NONEON-NEXT: str d0, [x1]
+; NOBF16-NOSVE-NONEON-NEXT: add sp, sp, #32
+; NOBF16-NOSVE-NONEON-NEXT: ret
+;
+; NOBF16-SVE-NONEON-LABEL: fcvt_v4f34_v4bf16:
+; NOBF16-SVE-NONEON: // %bb.0:
+; NOBF16-SVE-NONEON-NEXT: sub sp, sp, #16
+; NOBF16-SVE-NONEON-NEXT: .cfi_def_cfa_offset 16
+; NOBF16-SVE-NONEON-NEXT: ldr q0, [x0]
+; NOBF16-SVE-NONEON-NEXT: mov w8, #32767 // =0x7fff
+; NOBF16-SVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-SVE-NONEON-NEXT: mov z1.s, z0.s[3]
+; NOBF16-SVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-SVE-NONEON-NEXT: mov z2.s, z0.s[2]
+; NOBF16-SVE-NONEON-NEXT: mov z0.s, z0.s[1]
+; NOBF16-SVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-SVE-NONEON-NEXT: add w11, w9, w8
+; NOBF16-SVE-NONEON-NEXT: orr w9, w9, #0x400000
+; NOBF16-SVE-NONEON-NEXT: add w10, w10, w11
+; NOBF16-SVE-NONEON-NEXT: fmov w11, s1
+; NOBF16-SVE-NONEON-NEXT: csel w9, w9, w10, vs
+; NOBF16-SVE-NONEON-NEXT: fcmp s1, s1
+; NOBF16-SVE-NONEON-NEXT: lsr w9, w9, #16
+; NOBF16-SVE-NONEON-NEXT: ubfx w10, w11, #16, #1
+; NOBF16-SVE-NONEON-NEXT: add w12, w11, w8
+; NOBF16-SVE-NONEON-NEXT: orr w11, w11, #0x400000
+; NOBF16-SVE-NONEON-NEXT: add w10, w10, w12
+; NOBF16-SVE-NONEON-NEXT: fmov w12, s2
+; NOBF16-SVE-NONEON-NEXT: csel w10, w11, w10, vs
+; NOBF16-SVE-NONEON-NEXT: fcmp s2, s2
+; NOBF16-SVE-NONEON-NEXT: lsr w10, w10, #16
+; NOBF16-SVE-NONEON-NEXT: ubfx w11, w12, #16, #1
+; NOBF16-SVE-NONEON-NEXT: add w13, w12, w8
+; NOBF16-SVE-NONEON-NEXT: orr w12, w12, #0x400000
+; NOBF16-SVE-NONEON-NEXT: add w11, w11, w13
+; NOBF16-SVE-NONEON-NEXT: fmov w13, s0
+; NOBF16-SVE-NONEON-NEXT: csel w11, w12, w11, vs
+; NOBF16-SVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-SVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-SVE-NONEON-NEXT: lsr w9, w11, #16
+; NOBF16-SVE-NONEON-NEXT: ubfx w12, w13, #16, #1
+; NOBF16-SVE-NONEON-NEXT: add w8, w13, w8
+; NOBF16-SVE-NONEON-NEXT: str h0, [sp, #8]
+; NOBF16-SVE-NONEON-NEXT: fmov s0, w10
+; NOBF16-SVE-NONEON-NEXT: add w8, w12, w8
+; NOBF16-SVE-NONEON-NEXT: orr w12, w13, #0x400000
+; NOBF16-SVE-NONEON-NEXT: csel w8, w12, w8, vs
+; NOBF16-SVE-NONEON-NEXT: str h0, [sp, #14]
+; NOBF16-SVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-SVE-NONEON-NEXT: lsr w8, w8, #16
+; NOBF16-SVE-NONEON-NEXT: str h0, [sp, #12]
+; NOBF16-SVE-NONEON-NEXT: fmov s0, w8
+; NOBF16-SVE-NONEON-NEXT: str h0, [sp, #10]
+; NOBF16-SVE-NONEON-NEXT: ldr d0, [sp, #8]
+; NOBF16-SVE-NONEON-NEXT: str d0, [x1]
+; NOBF16-SVE-NONEON-NEXT: add sp, sp, #16
+; NOBF16-SVE-NONEON-NEXT: ret
+;
+; BF16-NOSVE-NONEON-LABEL: fcvt_v4f34_v4bf16:
+; BF16-NOSVE-NONEON: // %bb.0:
+; BF16-NOSVE-NONEON-NEXT: ldr q0, [x0]
+; BF16-NOSVE-NONEON-NEXT: str q0, [sp, #-32]!
+; BF16-NOSVE-NONEON-NEXT: .cfi_def_cfa_offset 32
+; BF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #12]
+; BF16-NOSVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-NOSVE-NONEON-NEXT: str h0, [sp, #30]
+; BF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #8]
+; BF16-NOSVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-NOSVE-NONEON-NEXT: str h0, [sp, #28]
+; BF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #4]
+; BF16-NOSVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-NOSVE-NONEON-NEXT: str h0, [sp, #26]
+; BF16-NOSVE-NONEON-NEXT: ldr s0, [sp]
+; BF16-NOSVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-NOSVE-NONEON-NEXT: str h0, [sp, #24]
+; BF16-NOSVE-NONEON-NEXT: ldr d0, [sp, #24]
+; BF16-NOSVE-NONEON-NEXT: str d0, [x1]
+; BF16-NOSVE-NONEON-NEXT: add sp, sp, #32
+; BF16-NOSVE-NONEON-NEXT: ret
+;
+; BF16-SVE-NONEON-LABEL: fcvt_v4f34_v4bf16:
+; BF16-SVE-NONEON: // %bb.0:
+; BF16-SVE-NONEON-NEXT: ptrue p0.s, vl4
+; BF16-SVE-NONEON-NEXT: ldr q0, [x0]
+; BF16-SVE-NONEON-NEXT: bfcvt z0.h, p0/m, z0.s
+; BF16-SVE-NONEON-NEXT: st1h { z0.s }, p0, [x1]
+; BF16-SVE-NONEON-NEXT: ret
+ %op1 = load <4 x float>, ptr %a
+ %res = fptrunc <4 x float> %op1 to <4 x bfloat>
+ store <4 x bfloat> %res, ptr %b
+ ret void
+}
+
+define void @fcvt_v2f32_v2bf16(ptr %a, ptr %b) {
+; BF16-NEON-LABEL: fcvt_v2f32_v2bf16:
+; BF16-NEON: // %bb.0:
+; BF16-NEON-NEXT: ldr d0, [x0]
+; BF16-NEON-NEXT: bfcvtn v0.4h, v0.4s
+; BF16-NEON-NEXT: str s0, [x1]
+; BF16-NEON-NEXT: ret
+;
+; NOBF16-NOSVE-NONEON-LABEL: fcvt_v2f32_v2bf16:
+; NOBF16-NOSVE-NONEON: // %bb.0:
+; NOBF16-NOSVE-NONEON-NEXT: sub sp, sp, #32
+; NOBF16-NOSVE-NONEON-NEXT: .cfi_def_cfa_offset 32
+; NOBF16-NOSVE-NONEON-NEXT: ldr d0, [x0]
+; NOBF16-NOSVE-NONEON-NEXT: mov w8, #32767 // =0x7fff
+; NOBF16-NOSVE-NONEON-NEXT: str d0, [sp, #8]
+; NOBF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #12]
+; NOBF16-NOSVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-NOSVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-NOSVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-NOSVE-NONEON-NEXT: add w11, w9, w8
+; NOBF16-NOSVE-NONEON-NEXT: orr w9, w9, #0x400000
+; NOBF16-NOSVE-NONEON-NEXT: add w10, w10, w11
+; NOBF16-NOSVE-NONEON-NEXT: csel w9, w9, w10, vs
+; NOBF16-NOSVE-NONEON-NEXT: lsr w9, w9, #16
+; NOBF16-NOSVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-NOSVE-NONEON-NEXT: str h0, [sp, #18]
+; NOBF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #8]
+; NOBF16-NOSVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-NOSVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-NOSVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-NOSVE-NONEON-NEXT: add w8, w9, w8
+; NOBF16-NOSVE-NONEON-NEXT: orr w9, w9, #0x400000
+; NOBF16-NOSVE-NONEON-NEXT: add w8, w10, w8
+; NOBF16-NOSVE-NONEON-NEXT: csel w8, w9, w8, vs
+; NOBF16-NOSVE-NONEON-NEXT: lsr w8, w8, #16
+; NOBF16-NOSVE-NONEON-NEXT: fmov s0, w8
+; NOBF16-NOSVE-NONEON-NEXT: str h0, [sp, #16]
+; NOBF16-NOSVE-NONEON-NEXT: ldr d0, [sp, #16]
+; NOBF16-NOSVE-NONEON-NEXT: str d0, [sp, #24]
+; NOBF16-NOSVE-NONEON-NEXT: ldr w8, [sp, #24]
+; NOBF16-NOSVE-NONEON-NEXT: str w8, [x1]
+; NOBF16-NOSVE-NONEON-NEXT: add sp, sp, #32
+; NOBF16-NOSVE-NONEON-NEXT: ret
+;
+; NOBF16-SVE-NONEON-LABEL: fcvt_v2f32_v2bf16:
+; NOBF16-SVE-NONEON: // %bb.0:
+; NOBF16-SVE-NONEON-NEXT: sub sp, sp, #16
+; NOBF16-SVE-NONEON-NEXT: .cfi_def_cfa_offset 16
+; NOBF16-SVE-NONEON-NEXT: ldr d0, [x0]
+; NOBF16-SVE-NONEON-NEXT: mov w8, #32767 // =0x7fff
+; NOBF16-SVE-NONEON-NEXT: fmov w9, s0
+; NOBF16-SVE-NONEON-NEXT: mov z1.s, z0.s[1]
+; NOBF16-SVE-NONEON-NEXT: fcmp s0, s0
+; NOBF16-SVE-NONEON-NEXT: ubfx w10, w9, #16, #1
+; NOBF16-SVE-NONEON-NEXT: add w11, w9, w8
+; NOBF16-SVE-NONEON-NEXT: orr w9, w9, #0x400000
+; NOBF16-SVE-NONEON-NEXT: add w10, w10, w11
+; NOBF16-SVE-NONEON-NEXT: fmov w11, s1
+; NOBF16-SVE-NONEON-NEXT: csel w9, w9, w10, vs
+; NOBF16-SVE-NONEON-NEXT: fcmp s1, s1
+; NOBF16-SVE-NONEON-NEXT: lsr w9, w9, #16
+; NOBF16-SVE-NONEON-NEXT: ubfx w10, w11, #16, #1
+; NOBF16-SVE-NONEON-NEXT: add w8, w11, w8
+; NOBF16-SVE-NONEON-NEXT: fmov s0, w9
+; NOBF16-SVE-NONEON-NEXT: add w8, w10, w8
+; NOBF16-SVE-NONEON-NEXT: orr w10, w11, #0x400000
+; NOBF16-SVE-NONEON-NEXT: csel w8, w10, w8, vs
+; NOBF16-SVE-NONEON-NEXT: lsr w8, w8, #16
+; NOBF16-SVE-NONEON-NEXT: str h0, [sp, #8]
+; NOBF16-SVE-NONEON-NEXT: fmov s0, w8
+; NOBF16-SVE-NONEON-NEXT: str h0, [sp, #10]
+; NOBF16-SVE-NONEON-NEXT: ldr d0, [sp, #8]
+; NOBF16-SVE-NONEON-NEXT: fmov w8, s0
+; NOBF16-SVE-NONEON-NEXT: str w8, [x1]
+; NOBF16-SVE-NONEON-NEXT: add sp, sp, #16
+; NOBF16-SVE-NONEON-NEXT: ret
+;
+; BF16-NOSVE-NONEON-LABEL: fcvt_v2f32_v2bf16:
+; BF16-NOSVE-NONEON: // %bb.0:
+; BF16-NOSVE-NONEON-NEXT: sub sp, sp, #32
+; BF16-NOSVE-NONEON-NEXT: .cfi_def_cfa_offset 32
+; BF16-NOSVE-NONEON-NEXT: ldr d0, [x0]
+; BF16-NOSVE-NONEON-NEXT: str d0, [sp, #8]
+; BF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #12]
+; BF16-NOSVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-NOSVE-NONEON-NEXT: str h0, [sp, #18]
+; BF16-NOSVE-NONEON-NEXT: ldr s0, [sp, #8]
+; BF16-NOSVE-NONEON-NEXT: bfcvt h0, s0
+; BF16-NOSVE-NONEON-NEXT: str h0, [sp, #16]
+; BF16-NOSVE-NONEON-NEXT: ldr d0, [sp, #16]
+; BF16-NOSVE-NONEON-NEXT: str d0, [sp, #24]
+; BF16-NOSVE-NONEON-NEXT: ldr w8, [sp, #24]
+; BF16-NOSVE-NONEON-NEXT: str w8, [x1]
+; BF16-NOSVE-NONEON-NEXT: add sp, sp, #32
+; BF16-NOSVE-NONEON-NEXT: ret
+;
+; BF16-SVE-NONEON-LABEL: fcvt_v2f32_v2bf16:
+; BF16-SVE-NONEON: // %bb.0:
+; BF16-SVE-NONEON-NEXT: ptrue p0.s, vl2
+; BF16-SVE-NONEON-NEXT: ldr d0, [x0]
+; BF16-SVE-NONEON-NEXT: bfcvt z0.h, p0/m, z0.s
+; BF16-SVE-NONEON-NEXT: st1h { z0.s }, p0, [x1]
+; BF16-SVE-NONEON-NEXT: ret
+ %op1 = load <2 x float>, ptr %a
+ %res = fptrunc <2 x float> %op1 to <2 x bfloat>
+ store <2 x bfloat> %res, ptr %b
+ ret void
+}
More information about the llvm-commits
mailing list