[llvm] [AArch64] optimise SVE cvt intrinsics with no active lanes (PR #104809)

Thu Aug 29 02:16:51 PDT 2024

https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/104809

>From f77215aebf7592a3bc958c56a64b448bfb482136 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Mon, 19 Aug 2024 14:43:04 +0000
Subject: [PATCH 1/4] [AArch64] optimise SVE cvt intrinsics with no active
 lanes

---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  62 +++-
 .../sve-intrinsic-comb-no-active-lanes-cvt.ll | 309 ++++++++++++++++++
 2 files changed, 370 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cvt.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a782c9c4351237..c546dfb1453492 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1073,6 +1073,32 @@ static bool isAllActivePredicate(Value *Pred) {
                          m_ConstantInt<AArch64SVEPredPattern::all>()));
 }
 
+// Simplify unary operation where predicate has all inactive lanes by replacing
+// instruction with its operand
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryReplace(InstCombiner &IC, IntrinsicInst &II,
+                                   bool hasInactiveVector) {
+  int PredOperand = hasInactiveVector ? 1 : 0;
+  int ReplaceOperand = hasInactiveVector ? 0 : 1;
+  if (match(II.getOperand(PredOperand), m_ZeroInt())) {
+    IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand));
+    return IC.eraseInstFromFunction(II);
+  }
+  return std::nullopt;
+}
+
+// Simplify unary operation where predicate has all inactive lanes or try to
+// replace with  _x form when all lanes are active
+static std::optional<Instruction *>
+instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {
+  if (isAllActivePredicate(II.getOperand(1)) &&
+      !isa<llvm::UndefValue>(II.getOperand(0))) {
+    Value *Undef = llvm::UndefValue::get(II.getType());
+    return IC.replaceOperand(II, 0, Undef);
+  }
+  return instCombineSVENoActiveUnaryReplace(IC, II, true);
+}
+
 // Erase unary operation where predicate has all inactive lanes
 static std::optional<Instruction *>
 instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
@@ -2104,7 +2130,41 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   switch (IID) {
   default:
     break;
-
+  case Intrinsic::aarch64_sve_fcvt_bf16f32:
+  case Intrinsic::aarch64_sve_fcvt_f16f32:
+  case Intrinsic::aarch64_sve_fcvt_f16f64:
+  case Intrinsic::aarch64_sve_fcvt_f32f16:
+  case Intrinsic::aarch64_sve_fcvt_f32f64:
+  case Intrinsic::aarch64_sve_fcvt_f64f16:
+  case Intrinsic::aarch64_sve_fcvt_f64f32:
+  case Intrinsic::aarch64_sve_fcvtlt_f32f16:
+  case Intrinsic::aarch64_sve_fcvtlt_f64f32:
+  case Intrinsic::aarch64_sve_fcvtnt_bf16f32:
+  case Intrinsic::aarch64_sve_fcvtnt_f16f32:
+  case Intrinsic::aarch64_sve_fcvtnt_f32f64:
+  case Intrinsic::aarch64_sve_fcvtx_f32f64:
+  case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
+  case Intrinsic::aarch64_sve_fcvtzs:
+  case Intrinsic::aarch64_sve_fcvtzs_i32f16:
+  case Intrinsic::aarch64_sve_fcvtzs_i32f64:
+  case Intrinsic::aarch64_sve_fcvtzs_i64f16:
+  case Intrinsic::aarch64_sve_fcvtzs_i64f32:
+  case Intrinsic::aarch64_sve_fcvtzu:
+  case Intrinsic::aarch64_sve_fcvtzu_i32f16:
+  case Intrinsic::aarch64_sve_fcvtzu_i32f64:
+  case Intrinsic::aarch64_sve_fcvtzu_i64f16:
+  case Intrinsic::aarch64_sve_fcvtzu_i64f32:
+  case Intrinsic::aarch64_sve_scvtf:
+  case Intrinsic::aarch64_sve_scvtf_f16i32:
+  case Intrinsic::aarch64_sve_scvtf_f16i64:
+  case Intrinsic::aarch64_sve_scvtf_f32i64:
+  case Intrinsic::aarch64_sve_scvtf_f64i32:
+  case Intrinsic::aarch64_sve_ucvtf:
+  case Intrinsic::aarch64_sve_ucvtf_f16i32:
+  case Intrinsic::aarch64_sve_ucvtf_f16i64:
+  case Intrinsic::aarch64_sve_ucvtf_f32i64:
+  case Intrinsic::aarch64_sve_ucvtf_f64i32:
+    return instCombineSVEAllOrNoActiveUnary(IC, II);
   case Intrinsic::aarch64_sve_st1_scatter:
   case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
   case Intrinsic::aarch64_sve_st1_scatter_sxtw:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cvt.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cvt.ll
new file mode 100644
index 00000000000000..eb01ff5dacab56
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cvt.ll
@@ -0,0 +1,309 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(
+; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[A]]
+;
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x half> @test_fcvt_f16_f32(<vscale x 8 x half> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_fcvt_f16_f32(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> %a, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_fcvt_f16_f64(<vscale x 8 x half> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_fcvt_f16_f64(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @test_fcvt_f32_f16(<vscale x 4 x float> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvt_f32_f16(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> %a, <vscale x 4 x i1> zeroinitializer, <vscale x 8 x half> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @test_fcvt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvt_f32_f64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %out = call<vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @test_fcvt_f64_f16(<vscale x 2 x double> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_fcvt_f64_f16(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
+;
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 8 x half> %b)
+  ret <vscale x 2 x double> %out
+}
+
+define <vscale x 2 x double> @test_fcvt_f64_f32(<vscale x 2 x double> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_fcvt_f64_f32(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
+;
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 4 x float> %b)
+  ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x float> @test_fcvtlt_f32_f16(<vscale x 4 x float> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtlt_f32_f16(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> %a, <vscale x 4 x i1> zeroinitializer, <vscale x 8 x half> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @test_fcvtlt_f64_f32(<vscale x 2 x double> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_fcvtlt_f64_f32(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
+;
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 4 x float> %b)
+  ret <vscale x 2 x double> %out
+}
+
+define <vscale x 8 x bfloat> @test_fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvtnt_bf16_f32(
+; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[A]]
+;
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x half> @test_fcvtnt_f16_f32(<vscale x 8 x half> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_fcvtnt_f16_f32(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fcvtnt.f16f32(<vscale x 8 x half> %a, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @test_fcvtnt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtnt_f32_f64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtnt.f32f64(<vscale x 4 x float> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @test_fcvtx_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtx_f32_f64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @test_fcvtxnt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtxnt_f32_f64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtxnt.f32f64(<vscale x 4 x float> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 8 x i16> @test_fcvtzs(<vscale x 8 x i16> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_fcvtzs(
+; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[A]]
+;
+  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 8 x i16> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f16(<vscale x 4 x i32> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzs_i32_f16(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
+;
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> %a, <vscale x 4 x i1> zeroinitializer,<vscale x 8 x half> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f64(<vscale x 4 x i32> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzs_i32_f64(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
+;
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f16(<vscale x 2 x i64> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzs_i64_f16(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[A]]
+;
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> %a, <vscale x 2 x i1> zeroinitializer,<vscale x 8 x half> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f32(<vscale x 2 x i64> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzs_i64_f32(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[A]]
+;
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 4 x float> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x i16> @test_fcvtzu(<vscale x 8 x i16> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_fcvtzu(
+; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[A]]
+;
+  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 8 x i16> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f16(<vscale x 4 x i32> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzu_i32_f16(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
+;
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> %a, <vscale x 4 x i1> zeroinitializer,<vscale x 8 x half> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f64(<vscale x 4 x i32> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzu_i32_f64(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
+;
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f16(<vscale x 2 x i64> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzu_i64_f16(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[A]]
+;
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> %a, <vscale x 2 x i1> zeroinitializer,<vscale x 8 x half> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f32(<vscale x 2 x i64> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzu_i64_f32(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[A]]
+;
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 4 x float> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x half> @test_scvtf(<vscale x 8 x half> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.nxv8f16.nxv8i16(<vscale x 8 x half> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 8 x i16> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_scvtf_f16_i32(<vscale x 8 x half> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf_f16_i32(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i32(<vscale x 8 x half> %a, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_scvtf_f16_i64(<vscale x 8 x half> %a,<vscale x 2 x i64> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf_f16_i64(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i64(<vscale x 8 x half> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x i64> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @test_scvtf_f32_i64(<vscale x 4 x float> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_scvtf_f32_i64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.scvtf.f32i64(<vscale x 4 x float> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x i64> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double>  @test_scvtf_f64_i32(<vscale x 2 x double>  %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_scvtf_f64_i32(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
+;
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.scvtf.f64i32(<vscale x 2 x double> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 4 x i32> %b)
+  ret <vscale x 2 x double>  %out
+}
+
+define <vscale x 8 x half> @test_ucvtf(<vscale x 8 x half> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.nxv8f16.nxv8i16(<vscale x 8 x half> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 8 x i16> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_ucvtf_f16_i32(<vscale x 8 x half> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf_f16_i32(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i32(<vscale x 8 x half> %a, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_ucvtf_f16_i64(<vscale x 8 x half> %a,<vscale x 2 x i64> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf_f16_i64(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i64(<vscale x 8 x half> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x i64> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @test_ucvtf_f32_i64(<vscale x 4 x float> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_ucvtf_f32_i64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.ucvtf.f32i64(<vscale x 4 x float> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 2 x i64> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double>  @test_ucvtf_f64_i32(<vscale x 2 x double>  %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_ucvtf_f64_i32(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
+;
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.ucvtf.f64i32(<vscale x 2 x double> %a, <vscale x 2 x i1> zeroinitializer, <vscale x 4 x i32> %b)
+  ret <vscale x 2 x double>  %out
+}

>From 82cc1e31368711b3c527dfc3399f5516c57d70aa Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 22 Aug 2024 15:37:47 +0000
Subject: [PATCH 2/4] Fix function naming and tests and added test for all
 active lanes

---
 .../AArch64/AArch64TargetTransformInfo.cpp    |   6 +-
 ...sve-intrinsic-comb-all-active-lanes-cvt.ll | 411 ++++++++++++++++++
 .../sve-intrinsic-comb-no-active-lanes-cvt.ll |   4 +-
 3 files changed, 416 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c546dfb1453492..52ec5d4c0a8c4c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1076,8 +1076,8 @@ static bool isAllActivePredicate(Value *Pred) {
 // Simplify unary operation where predicate has all inactive lanes by replacing
 // instruction with its operand
 static std::optional<Instruction *>
-instCombineSVENoActiveUnaryReplace(InstCombiner &IC, IntrinsicInst &II,
-                                   bool hasInactiveVector) {
+instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II,
+                              bool hasInactiveVector) {
   int PredOperand = hasInactiveVector ? 1 : 0;
   int ReplaceOperand = hasInactiveVector ? 0 : 1;
   if (match(II.getOperand(PredOperand), m_ZeroInt())) {
@@ -1096,7 +1096,7 @@ instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {
     Value *Undef = llvm::UndefValue::get(II.getType());
     return IC.replaceOperand(II, 0, Undef);
   }
-  return instCombineSVENoActiveUnaryReplace(IC, II, true);
+  return instCombineSVENoActiveReplace(IC, II, true);
 }
 
 // Erase unary operation where predicate has all inactive lanes
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll
new file mode 100644
index 00000000000000..40da5c57553351
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll
@@ -0,0 +1,411 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(
+; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OUT]]
+;
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x half> @test_fcvt_f16_f32(<vscale x 8 x half> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_fcvt_f16_f32(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> undef, <vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> %a, <vscale x 4 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_fcvt_f16_f64(<vscale x 8 x half> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_fcvt_f16_f64(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @test_fcvt_f32_f16(<vscale x 4 x float> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvt_f32_f16(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
+;
+  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> %a, <vscale x 4 x i1> %pg, <vscale x 8 x half> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @test_fcvt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvt_f32_f64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call<vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @test_fcvt_f64_f16(<vscale x 2 x double> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_fcvt_f64_f16(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> %a, <vscale x 2 x i1> %pg, <vscale x 8 x half> %b)
+  ret <vscale x 2 x double> %out
+}
+
+define <vscale x 2 x double> @test_fcvt_f64_f32(<vscale x 2 x double> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_fcvt_f64_f32(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> %a, <vscale x 2 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x float> @test_fcvtlt_f32_f16(<vscale x 4 x float> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtlt_f32_f16(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
+;
+  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> %a, <vscale x 4 x i1> %pg, <vscale x 8 x half> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @test_fcvtlt_f64_f32(<vscale x 2 x double> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_fcvtlt_f64_f32(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> %a, <vscale x 2 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 2 x double> %out
+}
+
+define <vscale x 8 x bfloat> @test_fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvtnt_bf16_f32(
+; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OUT]]
+;
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x half> @test_fcvtnt_f16_f32(<vscale x 8 x half> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_fcvtnt_f16_f32(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvtnt.f16f32(<vscale x 8 x half> undef, <vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fcvtnt.f16f32(<vscale x 8 x half> %a, <vscale x 4 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @test_fcvtnt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtnt_f32_f64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtnt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtnt.f32f64(<vscale x 4 x float> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @test_fcvtx_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtx_f32_f64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @test_fcvtxnt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtxnt_f32_f64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtxnt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtxnt.f32f64(<vscale x 4 x float> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 8 x i16> @test_fcvtzs(<vscale x 8 x i16> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_fcvtzs(
+; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[OUT]]
+;
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f16(<vscale x 4 x i32> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzs_i32_f16(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
+;
+  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg,<vscale x 8 x half> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f64(<vscale x 4 x i32> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzs_i32_f64(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f16(<vscale x 2 x i64> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzs_i64_f16(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg,<vscale x 8 x half> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f32(<vscale x 2 x i64> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzs_i64_f32(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x i16> @test_fcvtzu(<vscale x 8 x i16> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_fcvtzu(
+; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[OUT]]
+;
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f16(<vscale x 4 x i32> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzu_i32_f16(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
+;
+  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg,<vscale x 8 x half> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f64(<vscale x 4 x i32> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzu_i32_f64(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f16(<vscale x 2 x i64> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzu_i64_f16(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg,<vscale x 8 x half> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f32(<vscale x 2 x i64> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzu_i64_f32(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x half> @test_scvtf(<vscale x 8 x half> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.nxv8f16.nxv8i16(<vscale x 8 x half> undef, <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.nxv8f16.nxv8i16(<vscale x 8 x half> %a, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_scvtf_f16_i32(<vscale x 8 x half> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf_f16_i32(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i32(<vscale x 8 x half> undef, <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i32(<vscale x 8 x half> %a, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_scvtf_f16_i64(<vscale x 8 x half> %a,<vscale x 2 x i64> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf_f16_i64(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i64(<vscale x 8 x half> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i64(<vscale x 8 x half> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @test_scvtf_f32_i64(<vscale x 4 x float> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_scvtf_f32_i64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.scvtf.f32i64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.scvtf.f32i64(<vscale x 4 x float> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double>  @test_scvtf_f64_i32(<vscale x 2 x double>  %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_scvtf_f64_i32(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.scvtf.f64i32(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.scvtf.f64i32(<vscale x 2 x double> %a, <vscale x 2 x i1> %pg, <vscale x 4 x i32> %b)
+  ret <vscale x 2 x double>  %out
+}
+
+define <vscale x 8 x half> @test_ucvtf(<vscale x 8 x half> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.nxv8f16.nxv8i16(<vscale x 8 x half> undef, <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.nxv8f16.nxv8i16(<vscale x 8 x half> %a, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_ucvtf_f16_i32(<vscale x 8 x half> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf_f16_i32(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i32(<vscale x 8 x half> undef, <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i32(<vscale x 8 x half> %a, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 8 x half> @test_ucvtf_f16_i64(<vscale x 8 x half> %a,<vscale x 2 x i64> %b) {
+; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf_f16_i64(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i64(<vscale x 8 x half> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i64(<vscale x 8 x half> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @test_ucvtf_f32_i64(<vscale x 4 x float> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: define <vscale x 4 x float> @test_ucvtf_f32_i64(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.ucvtf.f32i64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.ucvtf.f32i64(<vscale x 4 x float> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double>  @test_ucvtf_f64_i32(<vscale x 2 x double>  %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: define <vscale x 2 x double> @test_ucvtf_f64_i32(
+; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ucvtf.f64i32(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
+;
+  %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.ucvtf.f64i32(<vscale x 2 x double> %a, <vscale x 2 x i1> %pg, <vscale x 4 x i32> %b)
+  ret <vscale x 2 x double>  %out
+}
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cvt.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cvt.ll
index eb01ff5dacab56..9b1528eda8ffd4 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cvt.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cvt.ll
@@ -133,7 +133,7 @@ define <vscale x 8 x i16> @test_fcvtzs(<vscale x 8 x i16> %a, <vscale x 8 x half
 ; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[A]]
 ;
-  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 8 x i16> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %b)
+  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %b)
   ret <vscale x 8 x i16> %out
 }
 
@@ -178,7 +178,7 @@ define <vscale x 8 x i16> @test_fcvtzu(<vscale x 8 x i16> %a, <vscale x 8 x half
 ; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[A]]
 ;
-  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 8 x i16> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %b)
+  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> %a, <vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %b)
   ret <vscale x 8 x i16> %out
 }
 

>From c6b530e0ce857ce4706643f0554423efa43665d0 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Tue, 27 Aug 2024 11:37:50 +0000
Subject: [PATCH 3/4] Fix comment and add poison value check

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 52ec5d4c0a8c4c..4f8e60b9e94ad1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1087,12 +1087,13 @@ instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II,
   return std::nullopt;
 }
 
-// Simplify unary operation where predicate has all inactive lanes or try to
-// replace with  _x form when all lanes are active
+// Simplify unary operation where predicate has all inactive lanes or
+// replace unused first operand with undef when all lanes are active
 static std::optional<Instruction *>
 instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {
   if (isAllActivePredicate(II.getOperand(1)) &&
-      !isa<llvm::UndefValue>(II.getOperand(0))) {
+      !isa<llvm::UndefValue>(II.getOperand(0)) &&
+      !isa<llvm::PoisonValue>(II.getOperand(0)) {
     Value *Undef = llvm::UndefValue::get(II.getType());
     return IC.replaceOperand(II, 0, Undef);
   }

>From e5340ce53edf1da5b51f1a4c2df963ae726a4db0 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 28 Aug 2024 14:03:11 +0000
Subject: [PATCH 4/4] add tests for poison/undef and fix intrinsics name

---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  2 +-
 ...sve-intrinsic-comb-all-active-lanes-cvt.ll | 26 ++++++++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4f8e60b9e94ad1..e447440713530f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1093,7 +1093,7 @@ static std::optional<Instruction *>
 instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {
   if (isAllActivePredicate(II.getOperand(1)) &&
       !isa<llvm::UndefValue>(II.getOperand(0)) &&
-      !isa<llvm::PoisonValue>(II.getOperand(0)) {
+      !isa<llvm::PoisonValue>(II.getOperand(0))) {
     Value *Undef = llvm::UndefValue::get(II.getType());
     return IC.replaceOperand(II, 0, Undef);
   }
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll
index 40da5c57553351..374a9851917685 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll
@@ -2,6 +2,30 @@
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 target triple = "aarch64-unknown-linux-gnu"
 
+define <vscale x 8 x bfloat> @test_fcvt_bf16_f32_undef(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvt_bf16_f32_undef(
+; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OUT]]
+;
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @test_fcvt_bf16_f32_poison(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvt_bf16_f32_poison(
+; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OUT]]
+;
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(
 ; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
@@ -178,7 +202,7 @@ define <vscale x 8 x i16> @test_fcvtzs(<vscale x 8 x i16> %a, <vscale x 8 x half
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[OUT]]
 ;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, <vscale x 8 x half> %b)
+  %out = call <vscale x 8 x i16>  @llvm.aarch64.sve.fcvtzs.nxv4i16.nxv4f16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, <vscale x 8 x half> %b)
   ret <vscale x 8 x i16> %out
 }