[llvm] [AArch64][SVE] Use NEON for ISD::FP_ROUND cases (PR #171776)
Matthew Devereau via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 00:06:12 PST 2025
https://github.com/MDevereau created https://github.com/llvm/llvm-project/pull/171776
This fixes a crash where a v8bf16 (ISD::FP_ROUND (v8f32 concat_vectors (v4f32, v4f32)), i64) node cannot be lowered when -msve-vector-bits=256. Both v8bf16 and v8f32 are legal types in this situation, but nothing can actually lower the node. It should be lowered to NEON's bfcvtn/bfcvtn2 instead.
Additionally, improve the v8f16 <- v8f32 case to use fcvtn/fcvtn2 instead of SVE.
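For context, a rough standalone reproducer (essentially the fptrunc_shuffle_v8bf16 test added below, with the attributes written inline; the @repro name and repro.ll file name are illustrative) looks like this, run with llc -mtriple=aarch64 repro.ll -o -:

; vscale_range(2,2) corresponds to -msve-vector-bits=256.
define <8 x bfloat> @repro(<4 x float> %a, <4 x float> %b) vscale_range(2,2) "target-features"="+bf16,+sve" {
  ; The shufflevector is built as a v8f32 concat_vectors node and the fptrunc
  ; as a v8bf16 FP_ROUND of it during SelectionDAG construction.
  %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %trunc = fptrunc <8 x float> %concat to <8 x bfloat>
  ret <8 x bfloat> %trunc
}

With this patch the conversion is split into two halves and selected to bfcvtn/bfcvtn2, as checked in the new test.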
>From c334ac32df717837d50a6dd411ccbbe566133d0a Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Thu, 11 Dec 2025 07:48:41 +0000
Subject: [PATCH] [AArch64][SVE] Use NEON for ISD::FP_ROUND cases
This fixes a crash where a v8bf16 (ISD::FP_ROUND (v8f32 concat_vectors
(v4f32, v4f32)), i64) node cannot be lowered when -msve-vector-bits=256.
Both v8bf16 and v8f32 are legal types in this situation, but nothing can
actually lower the node. It should be lowered to NEON's bfcvtn/bfcvtn2
instead.
Additionally, improve the v8f16 <- v8f32 case to use fcvtn/fcvtn2 instead of
SVE.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 31 ++++++++++++
llvm/test/CodeGen/AArch64/fptrunc_256.ll | 50 +++++++++++++++++++
2 files changed, 81 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/fptrunc_256.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3012343386c07..d18ad55de96b7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -892,6 +892,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
LegalizeNarrowFP(MVT::bf16);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v4bf16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
// AArch64 has implementations of a lot of rounding-like FP operations.
// clang-format off
@@ -4776,6 +4777,36 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
return getSVESafeBitCast(VT, Narrow, DAG);
}
+ // Split fp_rounds where VT is 128 bits and SrcVT is 256 bits. When the
+ // minimum SVE vector length is 256 bits, it is best to manually lower this
+ // with NEON for v8f16, and v8bf16 will crash without doing so, as both
+ // types are legal and will not be split automatically during legalization.
+ auto SplitConcat = [&](MVT DestTy, MVT HalfDestTy, MVT HalfSrcTy) {
+ SDValue Concat = Op->getOperand(0);
+ if (Concat.getOpcode() == ISD::CONCAT_VECTORS) {
+ SDValue ConcatOp0 = Concat.getOperand(0);
+ SDValue ConcatOp1 = Concat.getOperand(1);
+ SDLoc DL(Op);
+ SDValue L = DAG.getNode(ISD::FP_ROUND, DL, HalfDestTy, ConcatOp0,
+ Op->getOperand(1));
+ SDValue R = DAG.getNode(ISD::FP_ROUND, DL, HalfDestTy, ConcatOp1,
+ Op->getOperand(1));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, DestTy, L, R);
+ }
+ return SDValue();
+ };
+
+ if (VT == MVT::v8bf16) {
+ if (SrcVT == MVT::v8f32 && Subtarget->hasBF16())
+ if (auto Split = SplitConcat(MVT::v8bf16, MVT::v4bf16, MVT::v4f32))
+ return Split;
+ // Anything else for v8bf16 is legal
+ return Op;
+ }
+ if (VT == MVT::v8f16 && SrcVT == MVT::v8f32)
+ if (auto Split = SplitConcat(MVT::v8f16, MVT::v4f16, MVT::v4f32))
+ return Split;
+
if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
return LowerFixedLengthFPRoundToSVE(Op, DAG);
diff --git a/llvm/test/CodeGen/AArch64/fptrunc_256.ll b/llvm/test/CodeGen/AArch64/fptrunc_256.ll
new file mode 100644
index 0000000000000..3c59e6d554be4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fptrunc_256.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+define <8 x bfloat> @fptrunc_poison_shuffle_v8bf16(<4 x float> %a) #0 {
+; CHECK-LABEL: fptrunc_poison_shuffle_v8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfcvtn v1.4h, v0.4s
+; CHECK-NEXT: bfcvtn2 v1.8h, v0.4s
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x float> %a, <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+ %fpt = fptrunc <8 x float> %shuffle to <8 x bfloat>
+ ret <8 x bfloat> %fpt
+}
+
+define <8 x bfloat> @fptrunc_shuffle_v8bf16(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: fptrunc_shuffle_v8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %fpt = fptrunc <8 x float> %shuffle to <8 x bfloat>
+ ret <8 x bfloat> %fpt
+}
+
+define <8 x half> @fptrunc_poison_shuffle_v8f16(<4 x float> %a) #0 {
+; CHECK-LABEL: fptrunc_poison_shuffle_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v1.4h, v0.4s
+; CHECK-NEXT: fcvtn2 v1.8h, v0.4s
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x float> %a, <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+ %fpt = fptrunc <8 x float> %shuffle to <8 x half>
+ ret <8 x half> %fpt
+}
+
+define <8 x half> @fptrunc_shuffle_v8f16(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: fptrunc_shuffle_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %fpt = fptrunc <8 x float> %shuffle to <8 x half>
+ ret <8 x half> %fpt
+}
+
+attributes #0 = { vscale_range(2,2) "target-features"="+bf16,+sve" }