[llvm] [AArch64][SDAG] Add f16 -> i16 rounding NEON conversion intrinsics (PR #155851)
Kajetan Puchalski via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 30 09:35:26 PDT 2025
https://github.com/mrkajetanp updated https://github.com/llvm/llvm-project/pull/155851
>From e0c36f9fbdbfefea25342d93291e1eb1d499fc39 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Wed, 27 Aug 2025 22:11:43 +0000
Subject: [PATCH 1/3] [AArch64][SDAG] Add f16 -> i16 rounding NEON conversion
intrinsics
Add dedicated .i16.f16 formats for the rounding NEON conversion intrinsics
in order to avoid the incorrect overflow behaviour caused by using the
.i32.f16 formats (and truncating the result) to perform the same conversions.
Added intrinsic formats:
i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half)
i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half)
i16 @llvm.aarch64.neon.fcvtas.i16.f16(half)
i16 @llvm.aarch64.neon.fcvtms.i16.f16(half)
i16 @llvm.aarch64.neon.fcvtns.i16.f16(half)
i16 @llvm.aarch64.neon.fcvtps.i16.f16(half)
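As an illustrative sketch (the 40000.0 input is a hypothetical value, not
taken from the patch), the difference between the two formats for an input
above the signed 16-bit range looks like this:

    declare i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half)
    declare i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half)

    define i16 @wrapping_via_i32() {
      ; 0xH78E2 is 40000.0, above the signed 16-bit maximum of 32767.
      %wide = call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half 0xH78E2) ; 40000
      %narrow = trunc i32 %wide to i16                                 ; wraps to -25536
      ret i16 %narrow
    }

    define i16 @saturating_via_i16() {
      ; The dedicated format selects the scalar fcvtzs h0, h0, which
      ; saturates the result to the 16-bit range, yielding 32767.
      %exact = call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half 0xH78E2)
      ret i16 %exact
    }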
Signed-off-by: Kajetan Puchalski <kajetan.puchalski at arm.com>
---
.../Target/AArch64/AArch64ISelLowering.cpp | 38 +++++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 8 ++
.../AArch64/fp16_i16_intrinsic_scalar.ll | 80 +++++++++++++++++++
4 files changed, 128 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 23328ed57fb36..c135e0acafc49 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1290,6 +1290,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
} else {
// when AArch64 doesn't have fullfp16 support, promote the input
// to i32 first.
@@ -27802,6 +27804,18 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults(
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
}
+void AArch64TargetLowering::ReplaceFcvtFpToI16Intrinsic(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG,
+ unsigned Opcode) const {
+ if (N->getValueType(0).getScalarType() != MVT::i16)
+ return;
+
+ SDLoc DL(N);
+ SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
+ SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast));
+}
+
void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
assert((Subtarget->hasSVE2p1() ||
@@ -28292,6 +28306,30 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
+ case Intrinsic::aarch64_neon_fcvtzs: {
+ ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTZS_HALF);
+ return;
+ }
+ case Intrinsic::aarch64_neon_fcvtzu: {
+ ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTZU_HALF);
+ return;
+ }
+ case Intrinsic::aarch64_neon_fcvtas: {
+ ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTAS_HALF);
+ return;
+ }
+ case Intrinsic::aarch64_neon_fcvtms: {
+ ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTMS_HALF);
+ return;
+ }
+ case Intrinsic::aarch64_neon_fcvtns: {
+ ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTNS_HALF);
+ return;
+ }
+ case Intrinsic::aarch64_neon_fcvtps: {
+ ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTPS_HALF);
+ return;
+ }
}
}
case ISD::READ_REGISTER: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 46738365080f9..1805875688ddb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -850,6 +850,8 @@ class AArch64TargetLowering : public TargetLowering {
void ReplaceExtractSubVectorResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
+ void ReplaceFcvtFpToI16Intrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG, unsigned Opcode) const;
void ReplaceGetActiveLaneMaskResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 07c07008c0e05..24f9a6d00eac0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -991,6 +991,10 @@ def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
def AArch64fcvtzs_half : SDNode<"AArch64ISD::FCVTZS_HALF", SDTFPExtendOp>;
def AArch64fcvtzu_half : SDNode<"AArch64ISD::FCVTZU_HALF", SDTFPExtendOp>;
+def AArch64fcvtas_half : SDNode<"AArch64ISD::FCVTAS_HALF", SDTFPExtendOp>;
+def AArch64fcvtms_half : SDNode<"AArch64ISD::FCVTMS_HALF", SDTFPExtendOp>;
+def AArch64fcvtns_half : SDNode<"AArch64ISD::FCVTNS_HALF", SDTFPExtendOp>;
+def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
@@ -6550,6 +6554,10 @@ class F16ToI16ScalarPat<SDNode cvt_isd, BaseSIMDTwoScalar instr>
let Predicates = [HasFullFP16] in {
def : F16ToI16ScalarPat<AArch64fcvtzs_half, FCVTZSv1f16>;
def : F16ToI16ScalarPat<AArch64fcvtzu_half, FCVTZUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtas_half, FCVTASv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtms_half, FCVTMSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtns_half, FCVTNSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtps_half, FCVTPSv1f16>;
}
// Round FP64 to BF16.
diff --git a/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll b/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll
new file mode 100644
index 0000000000000..30bc80821ed80
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s
+
+; Test f16 -> i16 NEON intrinsics, currently only supported in SDAG.
+; Should be merged with fp16_intrinsic_scalar_1op.ll once there is
+; support in GISel.
+
+declare i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtas.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtms.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtns.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtps.i16.f16(half)
+
+
+define i16 @fcvtzu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtzu_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a)
+ ret i16 %fcvt
+}
+
+define i16 @fcvtzs_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtzs_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a)
+ ret i16 %fcvt
+}
+
+define i16 @fcvtas_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtas_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtas h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a)
+ ret i16 %fcvt
+}
+
+define i16 @fcvtms_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtms_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtms h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a)
+ ret i16 %fcvt
+}
+
+define i16 @fcvtns_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtns_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtns h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a)
+ ret i16 %fcvt
+}
+
+define i16 @fcvtps_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtps_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtps h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtps.i16.f16(half %a)
+ ret i16 %fcvt
+}
>From f8ea5cdb4cc8203742dffdc9596d28225092d148 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Fri, 29 Aug 2025 13:15:04 +0000
Subject: [PATCH 2/3] Add U-variants, move to performIntrinsicCombine
---
.../Target/AArch64/AArch64ISelLowering.cpp | 69 +++++++++----------
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 -
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 8 +++
.../AArch64/fp16_i16_intrinsic_scalar.ll | 66 +++++++++++++++---
4 files changed, 96 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c135e0acafc49..6a5b64e858373 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1290,8 +1290,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
-
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
} else {
// when AArch64 doesn't have fullfp16 support, promote the input
// to i32 first.
@@ -22191,6 +22189,17 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
Zero);
}
+static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
+ SelectionDAG &DAG) {
+ if (N->getValueType(0).getScalarType() != MVT::i16)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
+ SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
+}
+
// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
@@ -22444,6 +22453,26 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_uabd:
return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fcvtzs:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtzu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtas:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtau:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtms:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtmu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtns:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtnu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtps:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtpu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
return tryCombineCRC32(0xff, N, DAG);
@@ -27804,18 +27833,6 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults(
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
}
-void AArch64TargetLowering::ReplaceFcvtFpToI16Intrinsic(
- SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG,
- unsigned Opcode) const {
- if (N->getValueType(0).getScalarType() != MVT::i16)
- return;
-
- SDLoc DL(N);
- SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
- SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast));
-}
-
void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
assert((Subtarget->hasSVE2p1() ||
@@ -28306,30 +28323,6 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
- case Intrinsic::aarch64_neon_fcvtzs: {
- ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTZS_HALF);
- return;
- }
- case Intrinsic::aarch64_neon_fcvtzu: {
- ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTZU_HALF);
- return;
- }
- case Intrinsic::aarch64_neon_fcvtas: {
- ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTAS_HALF);
- return;
- }
- case Intrinsic::aarch64_neon_fcvtms: {
- ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTMS_HALF);
- return;
- }
- case Intrinsic::aarch64_neon_fcvtns: {
- ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTNS_HALF);
- return;
- }
- case Intrinsic::aarch64_neon_fcvtps: {
- ReplaceFcvtFpToI16Intrinsic(N, Results, DAG, AArch64ISD::FCVTPS_HALF);
- return;
- }
}
}
case ISD::READ_REGISTER: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1805875688ddb..46738365080f9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -850,8 +850,6 @@ class AArch64TargetLowering : public TargetLowering {
void ReplaceExtractSubVectorResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
- void ReplaceFcvtFpToI16Intrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG, unsigned Opcode) const;
void ReplaceGetActiveLaneMaskResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 24f9a6d00eac0..c945162f08464 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -992,9 +992,13 @@ def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
def AArch64fcvtzs_half : SDNode<"AArch64ISD::FCVTZS_HALF", SDTFPExtendOp>;
def AArch64fcvtzu_half : SDNode<"AArch64ISD::FCVTZU_HALF", SDTFPExtendOp>;
def AArch64fcvtas_half : SDNode<"AArch64ISD::FCVTAS_HALF", SDTFPExtendOp>;
+def AArch64fcvtau_half : SDNode<"AArch64ISD::FCVTAU_HALF", SDTFPExtendOp>;
def AArch64fcvtms_half : SDNode<"AArch64ISD::FCVTMS_HALF", SDTFPExtendOp>;
+def AArch64fcvtmu_half : SDNode<"AArch64ISD::FCVTMU_HALF", SDTFPExtendOp>;
def AArch64fcvtns_half : SDNode<"AArch64ISD::FCVTNS_HALF", SDTFPExtendOp>;
+def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>;
def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
+def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>;
//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
@@ -6555,9 +6559,13 @@ let Predicates = [HasFullFP16] in {
def : F16ToI16ScalarPat<AArch64fcvtzs_half, FCVTZSv1f16>;
def : F16ToI16ScalarPat<AArch64fcvtzu_half, FCVTZUv1f16>;
def : F16ToI16ScalarPat<AArch64fcvtas_half, FCVTASv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtau_half, FCVTAUv1f16>;
def : F16ToI16ScalarPat<AArch64fcvtms_half, FCVTMSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtmu_half, FCVTMUv1f16>;
def : F16ToI16ScalarPat<AArch64fcvtns_half, FCVTNSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtnu_half, FCVTNUv1f16>;
def : F16ToI16ScalarPat<AArch64fcvtps_half, FCVTPSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtpu_half, FCVTPUv1f16>;
}
// Round FP64 to BF16.
diff --git a/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll b/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll
index 30bc80821ed80..ab502508fadbd 100644
--- a/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll
@@ -5,33 +5,37 @@
; Should be merged with fp16_intrinsic_scalar_1op.ll once there is
; support in GISel.
-declare i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half)
declare i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half)
declare i16 @llvm.aarch64.neon.fcvtas.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtau.i16.f16(half)
declare i16 @llvm.aarch64.neon.fcvtms.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half)
declare i16 @llvm.aarch64.neon.fcvtns.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half)
declare i16 @llvm.aarch64.neon.fcvtps.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half)
-define i16 @fcvtzu_intrinsic_i16(half %a) {
-; CHECK-LABEL: fcvtzu_intrinsic_i16:
+define i16 @fcvtzs_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtzs_intrinsic_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu h0, h0
+; CHECK-NEXT: fcvtzs h0, h0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
- %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a)
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a)
ret i16 %fcvt
}
-define i16 @fcvtzs_intrinsic_i16(half %a) {
-; CHECK-LABEL: fcvtzs_intrinsic_i16:
+define i16 @fcvtzu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtzu_intrinsic_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs h0, h0
+; CHECK-NEXT: fcvtzu h0, h0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
- %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a)
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a)
ret i16 %fcvt
}
@@ -46,6 +50,17 @@ entry:
ret i16 %fcvt
}
+define i16 @fcvtau_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtau_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtau h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a)
+ ret i16 %fcvt
+}
+
define i16 @fcvtms_intrinsic_i16(half %a) {
; CHECK-LABEL: fcvtms_intrinsic_i16:
; CHECK: // %bb.0: // %entry
@@ -57,6 +72,17 @@ entry:
ret i16 %fcvt
}
+define i16 @fcvtmu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtmu_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtmu h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a)
+ ret i16 %fcvt
+}
+
define i16 @fcvtns_intrinsic_i16(half %a) {
; CHECK-LABEL: fcvtns_intrinsic_i16:
; CHECK: // %bb.0: // %entry
@@ -68,6 +94,17 @@ entry:
ret i16 %fcvt
}
+define i16 @fcvtnu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtnu_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtnu h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a)
+ ret i16 %fcvt
+}
+
define i16 @fcvtps_intrinsic_i16(half %a) {
; CHECK-LABEL: fcvtps_intrinsic_i16:
; CHECK: // %bb.0: // %entry
@@ -78,3 +115,14 @@ entry:
%fcvt = tail call i16 @llvm.aarch64.neon.fcvtps.i16.f16(half %a)
ret i16 %fcvt
}
+
+define i16 @fcvtpu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtpu_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtpu h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %fcvt = tail call i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half %a)
+ ret i16 %fcvt
+}
>From 63a7c7dff74903ea52023b07fa3da144c797f53d Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Sat, 30 Aug 2025 16:34:04 +0000
Subject: [PATCH 3/3] Fix i16 type check, add tests for v4i16.v4f16 formats
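A short sketch (illustrative only, not part of the patch) of why the stricter
check matters: for a vector result such as <4 x i16>, getScalarType() also
reports i16, so the scalar combine would have fired on calls that the existing
vector patterns already select correctly:

    declare <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half>)

    define <4 x i16> @vector_form(<4 x half> %a) {
      ; getValueType(0) is v4i16 here, so after this fix the scalar combine
      ; leaves the node alone and isel selects fcvtzs v0.4h, v0.4h directly.
      %v = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> %a)
      ret <4 x i16> %v
    }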
---
.../Target/AArch64/AArch64ISelLowering.cpp | 2 +-
.../AArch64/fp16_intrinsic_vector_1op.ll | 123 +++++++++++++++++-
2 files changed, 120 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6a5b64e858373..b7011e0ea1669 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22191,7 +22191,7 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
SelectionDAG &DAG) {
- if (N->getValueType(0).getScalarType() != MVT::i16)
+ if (N->getValueType(0) != MVT::i16)
return SDValue();
SDLoc DL(N);
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
index 58cbc2953dbcd..b4fc8971ede8a 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
@@ -1,13 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>)
declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half>)
define dso_local <4 x half> @t_vrndi_f16(<4 x half> %a) {
; CHECK-LABEL: t_vrndi_f16:
-; CHECK: frinti v0.4h, v0.4h
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frinti v0.4h, v0.4h
; CHECK-NEXT: ret
entry:
%vrndi1.i = tail call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %a)
@@ -16,7 +28,8 @@ entry:
define dso_local <8 x half> @t_vrndiq_f16(<8 x half> %a) {
; CHECK-LABEL: t_vrndiq_f16:
-; CHECK: frinti v0.8h, v0.8h
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frinti v0.8h, v0.8h
; CHECK-NEXT: ret
entry:
%vrndi1.i = tail call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %a)
@@ -25,7 +38,8 @@ entry:
define dso_local <4 x half> @t_vsqrt_f16(<4 x half> %a) {
; CHECK-LABEL: t_vsqrt_f16:
-; CHECK: fsqrt v0.4h, v0.4h
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4h, v0.4h
; CHECK-NEXT: ret
entry:
%vsqrt.i = tail call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
@@ -34,9 +48,110 @@ entry:
define dso_local <8 x half> @t_vsqrtq_f16(<8 x half> %a) {
; CHECK-LABEL: t_vsqrtq_f16:
-; CHECK: fsqrt v0.8h, v0.8h
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.8h, v0.8h
; CHECK-NEXT: ret
entry:
%vsqrt.i = tail call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
ret <8 x half> %vsqrt.i
}
+
+define <4 x i16> @t_fcvtzs_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtzs_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtzu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtzu_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtas_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtas_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtas v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtau_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtau_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtau v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtms_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtms_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtms v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtmu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtmu_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtmu v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtns_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtns_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtns v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtnu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtnu_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtnu v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtps_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtps_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtps v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtpu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtpu_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtpu v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> %a)
+ ret <4 x i16> %vcvt
+}