[llvm] [AArch64][SVE] Use NEON for ISD::FP_ROUND cases (PR #171776)
Matthew Devereau via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 00:06:12 PST 2025
https://github.com/MDevereau created https://github.com/llvm/llvm-project/pull/171776
This fixes a crash where a v8bf16 (ISD::FP_ROUND (v8f32 concat_vectors (v4f32, v4f32)), i64) node cannot be lowered when -msve-vector-bits=256. Both v8bf16 and v8f32 are legal types in this situation, but nothing can actually lower the node. It should be lowered to NEON's bfcvtn/bfcvtn2 instead.
Additionally, improve the v8f16 <- v8f32 case to use fcvtn/fcvtn2 instead of SVE.
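For context, a rough standalone reproducer (essentially the fptrunc_shuffle_v8bf16 test added below, with the attributes written inline; the @repro name and repro.ll file name are illustrative) looks like this, run with llc -mtriple=aarch64 repro.ll -o -:

; vscale_range(2,2) corresponds to -msve-vector-bits=256.
define <8 x bfloat> @repro(<4 x float> %a, <4 x float> %b) vscale_range(2,2) "target-features"="+bf16,+sve" {
  ; The shufflevector is built as a v8f32 concat_vectors node and the fptrunc
  ; as a v8bf16 FP_ROUND of it during SelectionDAG construction.
  %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %trunc = fptrunc <8 x float> %concat to <8 x bfloat>
  ret <8 x bfloat> %trunc
}

With this patch the conversion is split into two halves and selected to bfcvtn/bfcvtn2, as checked in the new test.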
>From c334ac32df717837d50a6dd411ccbbe566133d0a Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Thu, 11 Dec 2025 07:48:41 +0000
Subject: [PATCH] [AArch64][SVE] Use NEON for ISD::FP_ROUND cases
This fixes a crash where a v8bf16 (ISD::FP_ROUND (v8f32 concat_vectors
(v4f32, v4f32)), i64) node cannot be lowered when -msve-vector-bits=256.
Both v8bf16 and v8f32 are legal types in this situation, but nothing can
actually lower the node. It should be lowered to NEON's bfcvtn/bfcvtn2
instead.
Additionally, improve the v8f16 <- v8f32 case to use fcvtn/fcvtn2 instead of
SVE.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 31 ++++++++++++
llvm/test/CodeGen/AArch64/fptrunc_256.ll | 50 +++++++++++++++++++
2 files changed, 81 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/fptrunc_256.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3012343386c07..d18ad55de96b7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -892,6 +892,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
LegalizeNarrowFP(MVT::bf16);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v4bf16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
// AArch64 has implementations of a lot of rounding-like FP operations.
// clang-format off
@@ -4776,6 +4777,36 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
return getSVESafeBitCast(VT, Narrow, DAG);
}
+ // Split fp_rounds where VT is 128 bits and SrcVT is 256 bits. When the
+ // minimum SVE vector length is 256 bits, it is best to manually lower this
+ // with NEON for v8f16, and v8bf16 will crash without doing so, as both
+ // types are legal and will not be split automatically during legalization.
+ auto SplitConcat = [&](MVT DestTy, MVT HalfDestTy, MVT HalfSrcTy) {
+ SDValue Concat = Op->getOperand(0);
+ if (Concat.getOpcode() == ISD::CONCAT_VECTORS) {
+ SDValue ConcatOp0 = Concat.getOperand(0);
+ SDValue ConcatOp1 = Concat.getOperand(1);
+ SDLoc DL(Op);
+ SDValue L = DAG.getNode(ISD::FP_ROUND, DL, HalfDestTy, ConcatOp0,
+ Op->getOperand(1));
+ SDValue R = DAG.getNode(ISD::FP_ROUND, DL, HalfDestTy, ConcatOp1,
+ Op->getOperand(1));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, DestTy, L, R);
+ }
+ return SDValue();
+ };
+
+ if (VT == MVT::v8bf16) {
+ if (SrcVT == MVT::v8f32 && Subtarget->hasBF16())
+ if (auto Split = SplitConcat(MVT::v8bf16, MVT::v4bf16, MVT::v4f32))
+ return Split;
+ // Anything else for v8bf16 is legal
+ return Op;
+ }
+ if (VT == MVT::v8f16 && SrcVT == MVT::v8f32)
+ if (auto Split = SplitConcat(MVT::v8f16, MVT::v4f16, MVT::v4f32))
+ return Split;
+
if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
return LowerFixedLengthFPRoundToSVE(Op, DAG);
diff --git a/llvm/test/CodeGen/AArch64/fptrunc_256.ll b/llvm/test/CodeGen/AArch64/fptrunc_256.ll
new file mode 100644
index 0000000000000..3c59e6d554be4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fptrunc_256.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+define <8 x bfloat> @fptrunc_poison_shuffle_v8bf16(<4 x float> %a) #0 {
+; CHECK-LABEL: fptrunc_poison_shuffle_v8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfcvtn v1.4h, v0.4s
+; CHECK-NEXT: bfcvtn2 v1.8h, v0.4s
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x float> %a, <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+ %fpt = fptrunc <8 x float> %shuffle to <8 x bfloat>
+ ret <8 x bfloat> %fpt
+}
+
+define <8 x bfloat> @fptrunc_shuffle_v8bf16(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: fptrunc_shuffle_v8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %fpt = fptrunc <8 x float> %shuffle to <8 x bfloat>
+ ret <8 x bfloat> %fpt
+}
+
+define <8 x half> @fptrunc_poison_shuffle_v8f16(<4 x float> %a) #0 {
+; CHECK-LABEL: fptrunc_poison_shuffle_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-NEXT: fcvtn2 v1.8h, v0.4s
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x float> %a, <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+ %fpt = fptrunc <8 x float> %shuffle to <8 x half>
+ ret <8 x half> %fpt
+}
+
+define <8 x half> @fptrunc_shuffle_v8f16(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: fptrunc_shuffle_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %fpt = fptrunc <8 x float> %shuffle to <8 x half>
+ ret <8 x half> %fpt
+}
+
+attributes #0 = { vscale_range(2,2) "target-features"="+bf16,+sve" }