[clang] [llvm] [SME] Add intrinsics for FCVT(wid.) and FCVTL (PR #90215)

Tue Apr 30 06:13:10 PDT 2024

https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/90215

>From a1750b2b5658f8ced700bbf010019703fc52f126 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Mon, 15 Apr 2024 13:31:00 +0000
Subject: [PATCH 1/6] [LLVM][AARCH64]Replace +sme2p1+smef16f16 by +smef16f16

According to the latest ISA Spec release[1] all instructions under:
 HasSME2p1 and HasSMEF16F16
should now only require:
HasSMEF16F16

[1]https://developer.arm.com
---
 llvm/test/MC/AArch64/SME2p1/fadd.s             | 8 ++++++++
 llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s | 2 +-
 llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s | 2 +-
 llvm/test/MC/AArch64/SME2p1/fsub.s             | 8 ++++++++
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/test/MC/AArch64/SME2p1/fadd.s b/llvm/test/MC/AArch64/SME2p1/fadd.s
index bdb769093c8388..ec4f27e021a000 100644
--- a/llvm/test/MC/AArch64/SME2p1/fadd.s
+++ b/llvm/test/MC/AArch64/SME2p1/fadd.s
@@ -1,16 +1,24 @@
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
 // RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fadd    za.h[w8, 0, vgx2], {z0.h, z1.h}  // 11000001-10100100-00011100-00000000
diff --git a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
index 2f0dccb57c9076..c31b54fc05deaf 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
@@ -66,7 +66,7 @@ fmla za.h[w8, 8, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // Invalid Register Suffix
 
 fmla za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .h
 // CHECK-NEXT: fmla za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
index 3ff09321e3436b..2deb18186eafca 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
@@ -66,7 +66,7 @@ fmls za.h[w8, 8, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // Invalid Register Suffix
 
 fmls za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .h
 // CHECK-NEXT: fmls za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fsub.s b/llvm/test/MC/AArch64/SME2p1/fsub.s
index 66410008eb11d1..e42a819e0d4150 100644
--- a/llvm/test/MC/AArch64/SME2p1/fsub.s
+++ b/llvm/test/MC/AArch64/SME2p1/fsub.s
@@ -1,16 +1,24 @@
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
 // RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 

>From 1dab277b62d14163af243cfe608ad43dbe687a45 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Thu, 18 Apr 2024 13:53:36 +0000
Subject: [PATCH 2/6] Address review comments

---
 llvm/include/llvm/TargetParser/AArch64TargetParser.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 0d1cfd152151aa..0b4fb0ad773ec7 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -303,6 +303,7 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"lut", AArch64::AEK_LUT, "+lut", "-lut", FEAT_INIT, "", 0},
     {"sme-lutv2", AArch64::AEK_SME_LUTv2, "+sme-lutv2", "-sme-lutv2", FEAT_INIT, "", 0},
     {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+fp8,+sme2", 0},
+    {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+fp8,+sme2", 0},
     {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0},
     {"sme-fa64",  AArch64::AEK_SMEFA64,  "+sme-fa64", "-sme-fa64",  FEAT_INIT, "", 0},
     {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0},

>From 176083b8562ef5f6b265ed14a3d4f81e4555ee6e Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Fri, 19 Apr 2024 16:03:03 +0000
Subject: [PATCH 3/6] Fix MC tests

---
 llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s | 2 +-
 llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
index c31b54fc05deaf..2f0dccb57c9076 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
@@ -66,7 +66,7 @@ fmla za.h[w8, 8, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // Invalid Register Suffix
 
 fmla za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
 // CHECK-NEXT: fmla za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
index 2deb18186eafca..3ff09321e3436b 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
@@ -66,7 +66,7 @@ fmls za.h[w8, 8, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // Invalid Register Suffix
 
 fmls za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
 // CHECK-NEXT: fmls za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 

>From 8cfdd4f0e317646f43ed18a08f2b5f40eafae129 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 26 Apr 2024 13:58:47 +0000
Subject: [PATCH 4/6] [SME] Add intrinsics for FCVT(wid.) and FCVTL

---
 clang/include/clang/Basic/arm_sve.td          | 11 +++++
 .../aarch64-sme2-intrinsics/acle_sme2_cvt.c   | 22 ++++++++++
 .../aarch64-sme2-intrinsics/acle_sme2_cvtl.c  | 40 +++++++++++++++++++
 .../aarch64-sme2-intrinsics/acle_sme2_cvtl.s  | 27 +++++++++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     | 14 ++++++-
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  6 +++
 .../CodeGen/AArch64/sme2-intrinsics-cvt.ll    | 11 ++++-
 .../CodeGen/AArch64/sme2-intrinsics-cvtl.ll   | 11 +++++
 8 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.s
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 15340ebb62b365..e809b4a57d6543 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2265,6 +2265,10 @@ let TargetGuard = "sme2" in {
   def SVCVT_S32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "i",  MergeNone, "aarch64_sve_fcvtzs_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>;
 }
 
+let TargetGuard = "sme-f16f16" in {
+  def SVCVT_F32_X2 : SInst<"svcvt_{d}[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvt_widen_x2", [ IsStreaming],[]>;
+} 
+
 //
 // Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16
 //
@@ -2273,6 +2277,13 @@ let TargetGuard = "sme2" in {
   def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>;
 }
 
+//
+//Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
+//
+let TargetGuard = "sme-f16f16" in {
+  def SVCVTL_F32_X2 : SInst<"svcvtl_f32[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvtl_widen_x2", [ IsStreaming],[]>;
+} 
+
 //
 // Multi-vector saturating extract narrow
 //
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
index 79a11c2ec153e4..d117a685bfc290 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
@@ -497,3 +497,25 @@ svuint8_t test_qcvt_u8_s32_x4(svint32x4_t zn) __arm_streaming {
 svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svqcvt_u16,_s64_x4,,)(zn);
 }
+
+// CHECK-LABEL: @test_cvt_f32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z15test_cvt_f32_x2u13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+__attribute__((target("sme-f16f16"))) svfloat32x2_t test_cvt_f32_x2(svfloat16_t zn)  __arm_streaming {
+  return SVE_ACLE_FUNC(svcvt_f32,_f16_x2,,)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
new file mode 100644
index 00000000000000..1142065614b8ff
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
@@ -0,0 +1,40 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_cvtl_f32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z16test_cvtl_f32_x2u13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+svfloat32x2_t test_cvtl_f32_x2(svfloat16_t zn)  __arm_streaming {
+  return SVE_ACLE_FUNC(svcvtl_f32,_f16_x2,,)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.s b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.s
new file mode 100644
index 00000000000000..55078ec0f96457
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.s
@@ -0,0 +1,27 @@
+	.text
+	.file	"acle_sme2_cvtl.c"
+	.globl	test_cvtl_f32_x2                // -- Begin function test_cvtl_f32_x2
+	.p2align	2
+	.type	test_cvtl_f32_x2, at function
+	.variant_pcs	test_cvtl_f32_x2
+test_cvtl_f32_x2:                       // @test_cvtl_f32_x2
+.Ltest_cvtl_f32_x2$local:
+	.type	.Ltest_cvtl_f32_x2$local, at function
+// %bb.0:                               // %entry
+	str	x29, [sp, #-16]!                // 8-byte Folded Spill
+	addvl	sp, sp, #-1
+	ptrue	p0.h
+	st1h	{ z0.h }, p0, [sp]
+	ld1h	{ z0.h }, p0/z, [sp]
+	fcvtl	{ z2.s, z3.s }, z0.h
+	mov	z0.d, z2.d
+	mov	z1.d, z3.d
+	addvl	sp, sp, #1
+	ldr	x29, [sp], #16                  // 8-byte Folded Reload
+	ret
+.Lfunc_end0:
+	.size	test_cvtl_f32_x2, .Lfunc_end0-test_cvtl_f32_x2
+	.size	.Ltest_cvtl_f32_x2$local, .Lfunc_end0-test_cvtl_f32_x2
+                                        // -- End function
+	.ident	"clang version 19.0.0git (git at github.com:Lukacma/llvm-project.git 176083b8562ef5f6b265ed14a3d4f81e4555ee6e)"
+	.section	".note.GNU-stack","", at progbits
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index e31e00a9c76f31..7b8eeafec597bf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3121,6 +3121,11 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrNoMem]>;
+  
+  class SME2_CVT_WIDENING_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                            [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
+  
 
   class SME2_CVT_VG4_SINGLE_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
@@ -3412,6 +3417,13 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_suvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
   def int_aarch64_sme_usvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
 
+
+  //
+  //Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
+  //
+  
+  def int_aarch64_sve_fcvtl_widen_x2  : SME2_CVT_WIDENING_VG2_Intrinsic;
+
   //
   // Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16
   //
@@ -3431,7 +3443,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
   def int_aarch64_sve_scvtf_x4  : SME2_CVT_X4_Intrinsic;
   def int_aarch64_sve_ucvtf_x4  : SME2_CVT_X4_Intrinsic;
-
+  def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
   //
   // Multi-vector saturating extract narrow
   //
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 80272213dd3897..6db04c37e8a422 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5713,6 +5713,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::aarch64_sve_ucvtf_x4:
       SelectCVTIntrinsic(Node, 4, AArch64::UCVTF_4Z4Z_StoS);
       return;
+    case Intrinsic::aarch64_sve_fcvt_widen_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVT_2ZZ_H_S);
+      return;
+    case Intrinsic::aarch64_sve_fcvtl_widen_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVTL_2ZZ_H_S);
+      return;
     case Intrinsic::aarch64_sve_sclamp_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
index bc1db878cbd313..611cdcda157e21 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s
 
 ;
 ; FCVT
@@ -139,6 +139,15 @@ define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale
   ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res
 }
 
+define {<vscale x 4 x float>, <vscale x 4 x float>}  @multi_vector_cvt_widen_x2_f16(<vscale x 8 x half> %zn0) {
+; CHECK-LABEL: multi_vector_cvt_widen_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvt { z0.s, z1.s }, z0.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
 declare <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float>, <vscale x 4 x float>)
 declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll
new file mode 100644
index 00000000000000..30dc7cbfaea6c9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll
@@ -0,0 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s
+
+define {<vscale x 4 x float>, <vscale x 4 x float>}  @multi_vector_cvtl_widen_x2_f16(<vscale x 8 x half> %zn0) {
+; CHECK-LABEL: multi_vector_cvtl_widen_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl { z0.s, z1.s }, z0.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}

>From bb90148de759c0e2113532db8dac1f3f4cd04c39 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 26 Apr 2024 14:46:05 +0000
Subject: [PATCH 5/6] Revert incorrect changes

---
 .../aarch64-sme2-intrinsics/acle_sme2_cvtl.s  | 27 -------------------
 .../llvm/TargetParser/AArch64TargetParser.h   |  1 -
 llvm/test/MC/AArch64/SME2p1/fadd.s            |  8 ------
 llvm/test/MC/AArch64/SME2p1/fsub.s            |  8 ------
 4 files changed, 44 deletions(-)
 delete mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.s

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.s b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.s
deleted file mode 100644
index 55078ec0f96457..00000000000000
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.s
+++ /dev/null
@@ -1,27 +0,0 @@
-	.text
-	.file	"acle_sme2_cvtl.c"
-	.globl	test_cvtl_f32_x2                // -- Begin function test_cvtl_f32_x2
-	.p2align	2
-	.type	test_cvtl_f32_x2, at function
-	.variant_pcs	test_cvtl_f32_x2
-test_cvtl_f32_x2:                       // @test_cvtl_f32_x2
-.Ltest_cvtl_f32_x2$local:
-	.type	.Ltest_cvtl_f32_x2$local, at function
-// %bb.0:                               // %entry
-	str	x29, [sp, #-16]!                // 8-byte Folded Spill
-	addvl	sp, sp, #-1
-	ptrue	p0.h
-	st1h	{ z0.h }, p0, [sp]
-	ld1h	{ z0.h }, p0/z, [sp]
-	fcvtl	{ z2.s, z3.s }, z0.h
-	mov	z0.d, z2.d
-	mov	z1.d, z3.d
-	addvl	sp, sp, #1
-	ldr	x29, [sp], #16                  // 8-byte Folded Reload
-	ret
-.Lfunc_end0:
-	.size	test_cvtl_f32_x2, .Lfunc_end0-test_cvtl_f32_x2
-	.size	.Ltest_cvtl_f32_x2$local, .Lfunc_end0-test_cvtl_f32_x2
-                                        // -- End function
-	.ident	"clang version 19.0.0git (git at github.com:Lukacma/llvm-project.git 176083b8562ef5f6b265ed14a3d4f81e4555ee6e)"
-	.section	".note.GNU-stack","", at progbits
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 0b4fb0ad773ec7..0d1cfd152151aa 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -303,7 +303,6 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"lut", AArch64::AEK_LUT, "+lut", "-lut", FEAT_INIT, "", 0},
     {"sme-lutv2", AArch64::AEK_SME_LUTv2, "+sme-lutv2", "-sme-lutv2", FEAT_INIT, "", 0},
     {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+fp8,+sme2", 0},
-    {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+fp8,+sme2", 0},
     {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0},
     {"sme-fa64",  AArch64::AEK_SMEFA64,  "+sme-fa64", "-sme-fa64",  FEAT_INIT, "", 0},
     {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0},
diff --git a/llvm/test/MC/AArch64/SME2p1/fadd.s b/llvm/test/MC/AArch64/SME2p1/fadd.s
index ec4f27e021a000..bdb769093c8388 100644
--- a/llvm/test/MC/AArch64/SME2p1/fadd.s
+++ b/llvm/test/MC/AArch64/SME2p1/fadd.s
@@ -1,24 +1,16 @@
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
 // RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fadd    za.h[w8, 0, vgx2], {z0.h, z1.h}  // 11000001-10100100-00011100-00000000
diff --git a/llvm/test/MC/AArch64/SME2p1/fsub.s b/llvm/test/MC/AArch64/SME2p1/fsub.s
index e42a819e0d4150..66410008eb11d1 100644
--- a/llvm/test/MC/AArch64/SME2p1/fsub.s
+++ b/llvm/test/MC/AArch64/SME2p1/fsub.s
@@ -1,24 +1,16 @@
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
-// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
 // RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 

>From 71b4f03abf85e7612e9a157af06f76ad575ec648 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Tue, 30 Apr 2024 13:12:50 +0000
Subject: [PATCH 6/6] removed trailing whitespace

---
 clang/include/clang/Basic/arm_sve.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index e809b4a57d6543..af0562ad870808 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2267,7 +2267,7 @@ let TargetGuard = "sme2" in {
 
 let TargetGuard = "sme-f16f16" in {
   def SVCVT_F32_X2 : SInst<"svcvt_{d}[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvt_widen_x2", [ IsStreaming],[]>;
-} 
+}
 
 //
 // Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16
@@ -2282,7 +2282,7 @@ let TargetGuard = "sme2" in {
 //
 let TargetGuard = "sme-f16f16" in {
   def SVCVTL_F32_X2 : SInst<"svcvtl_f32[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvtl_widen_x2", [ IsStreaming],[]>;
-} 
+}
 
 //
 // Multi-vector saturating extract narrow