[llvm] [AArch64][SVE] Combine UXT[BHW] intrinsics to AND. (PR #137956)

Ricardo Jesus via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 30 08:17:01 PDT 2025


https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/137956

From ed0ee9aba8f4e13d97cf606a683c7e5c13a5b77c Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 30 Apr 2025 02:21:38 -0700
Subject: [PATCH 1/3] Precommit tests.

---
 .../InstCombine/AArch64/sve-intrinsic-uxt.ll  | 342 ++++++++++++++++++
 1 file changed, 342 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll
new file mode 100644
index 0000000000000..e194d6346e64e
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll
@@ -0,0 +1,342 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 2 x i64> @uxtb_z_64(<vscale x 2 x i64> %0) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_z_64(
+; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @uxtb_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_m_64(
+; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+;
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
+  ret <vscale x 2 x i64> %3
+}
+
+define <vscale x 2 x i64> @uxtb_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_x_64(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
+;
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %3, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %4
+}
+
+define <vscale x 2 x i64> @uxtb_z_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_z_64_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
+;
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %3, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %4
+}
+
+define <vscale x 2 x i64> @uxtb_m_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_m_64_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]], <vscale x 2 x i64> [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
+;
+  %4 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %5 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> %2, <vscale x 2 x i1> %4, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %5
+}
+
+define <vscale x 4 x i32> @uxtb_z_32(<vscale x 4 x i32> %0) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_z_32(
+; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+;
+  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @uxtb_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_m_32(
+; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+;
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> %1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
+  ret <vscale x 4 x i32> %3
+}
+
+define <vscale x 4 x i32> @uxtb_x_32(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_x_32(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
+;
+  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %3, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %4
+}
+
+define <vscale x 4 x i32> @uxtb_z_32_no_ptrue(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_z_32_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
+;
+  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %3, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %4
+}
+
+define <vscale x 4 x i32> @uxtb_m_32_no_ptrue(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_m_32_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]], <vscale x 4 x i32> [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
+;
+  %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
+  %5 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> %2, <vscale x 4 x i1> %4, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %5
+}
+
+define <vscale x 8 x i16> @uxtb_z_16(<vscale x 8 x i16> %0) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_z_16(
+; CHECK-SAME: <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+;
+  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> %0)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x i16> @uxtb_m_16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_m_16(
+; CHECK-SAME: <vscale x 8 x i16> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+;
+  %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> %1, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> %0)
+  ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 8 x i16> @uxtb_x_16(<vscale x 16 x i1> %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_x_16(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP4]]
+;
+  %3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %3, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %4
+}
+
+define <vscale x 8 x i16> @uxtb_z_16_no_ptrue(<vscale x 16 x i1> %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_z_16_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP4]]
+;
+  %3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %3, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %4
+}
+
+define <vscale x 8 x i16> @uxtb_m_16_no_ptrue(<vscale x 16 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_m_16_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]], <vscale x 8 x i16> [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP4]], <vscale x 8 x i16> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
+;
+  %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
+  %5 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> %2, <vscale x 8 x i1> %4, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %5
+}
+
+define <vscale x 2 x i64> @uxth_z_64(<vscale x 2 x i64> %0) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxth_z_64(
+; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @uxth_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxth_m_64(
+; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+;
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
+  ret <vscale x 2 x i64> %3
+}
+
+define <vscale x 2 x i64> @uxth_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxth_x_64(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
+;
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %3, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %4
+}
+
+define <vscale x 2 x i64> @uxth_z_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxth_z_64_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
+;
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %3, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %4
+}
+
+define <vscale x 2 x i64> @uxth_m_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxth_m_64_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]], <vscale x 2 x i64> [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
+;
+  %4 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %5 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> %2, <vscale x 2 x i1> %4, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %5
+}
+
+define <vscale x 4 x i32> @uxth_z_32(<vscale x 4 x i32> %0) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxth_z_32(
+; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+;
+  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @uxth_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxth_m_32(
+; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+;
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> %1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
+  ret <vscale x 4 x i32> %3
+}
+
+define <vscale x 4 x i32> @uxth_x_32(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxth_x_32(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
+;
+  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %3, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %4
+}
+
+define <vscale x 4 x i32> @uxth_z_32_no_ptrue(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxth_z_32_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
+;
+  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %3, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %4
+}
+
+define <vscale x 4 x i32> @uxth_m_32_no_ptrue(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @uxth_m_32_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]], <vscale x 4 x i32> [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
+;
+  %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
+  %5 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> %2, <vscale x 4 x i1> %4, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %5
+}
+
+define <vscale x 2 x i64> @uxtw_z_64(<vscale x 2 x i64> %0) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_z_64(
+; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @uxtw_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_m_64(
+; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+;
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
+  ret <vscale x 2 x i64> %3
+}
+
+define <vscale x 2 x i64> @uxtw_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_x_64(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
+;
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %3, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %4
+}
+
+define <vscale x 2 x i64> @uxtw_z_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_z_64_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
+;
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %3, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %4
+}
+
+define <vscale x 2 x i64> @uxtw_m_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_m_64_no_ptrue(
+; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]], <vscale x 2 x i64> [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP4]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
+;
+  %4 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %5 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> %2, <vscale x 2 x i1> %4, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %5
+}
+
+attributes #0 = { "target-features"="+sve" }

From a381043a2888e28a9eb59d2df8860467e1d6b201 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 30 Apr 2025 01:02:57 -0700
Subject: [PATCH 2/3] [AArch64][SVE] Combine UXT[BHW] intrinsics to AND.

Currently, we lower UXT[BHW] intrinsics into the corresponding UXT*
instruction. However, when the governing predicate is all-true or the
passthrough is undef (e.g. with "don't care" merging), we can lower
them to an AND with an immediate mask instead.

For example:
```cpp
#include <arm_sve.h>

svuint64_t foo_z(svuint64_t x) {
  return svextb_z(svptrue_b64(), x);
}
```

Currently:
```
foo_z:
  ptrue   p0.d
  movi    v1.2d, #0000000000000000
  uxtb    z0.d, p0/m, z0.d
  ret
```

Becomes:
```
foo_z:
  and     z0.d, z0.d, #0xff
  ret
```

We do this early in InstCombine in case it unblocks other
simplifications.
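
At the IR level, this patch folds the intrinsic call into a plain `and`
(patch 3 of this series later switches to `llvm.aarch64.sve.and.u`). A
minimal sketch, mirroring the `uxtb_z_64` test below (the function name
here is illustrative):

```llvm
; Before: zeroing uxtb with an all-true governing predicate.
define <vscale x 2 x i64> @uxtb_z_64_sketch(<vscale x 2 x i64> %x) {
  %r = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %x)
  ret <vscale x 2 x i64> %r
}
; After instcombine (this patch): the call folds to a mask of the low 8 bits.
;   %r = and <vscale x 2 x i64> %x, splat (i64 255)
declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
```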
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 26 ++++++++++++
 .../InstCombine/AArch64/sve-intrinsic-uxt.ll  | 42 ++++++++-----------
 2 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 594f1bff5c458..e9050d184f0f7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2640,6 +2640,26 @@ static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
   return std::nullopt;
 }
 
+static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
+                                                      IntrinsicInst &II,
+                                                      unsigned NumBits) {
+  Value *Passthru = II.getOperand(0);
+  Value *Pg = II.getOperand(1);
+  Value *Op = II.getOperand(2);
+
+  // Convert UXT[BHW] to AND.
+  if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
+    auto *Ty = cast<VectorType>(II.getType());
+    auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
+    auto *Mask = ConstantVector::getSplat(
+        Ty->getElementCount(),
+        ConstantInt::get(Ty->getElementType(), MaskValue));
+    return IC.replaceInstUsesWith(II, IC.Builder.CreateAnd(Op, Mask));
+  }
+
+  return std::nullopt;
+}
+
 std::optional<Instruction *>
 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                      IntrinsicInst &II) const {
@@ -2745,6 +2765,12 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVEInsr(IC, II);
   case Intrinsic::aarch64_sve_ptrue:
     return instCombinePTrue(IC, II);
+  case Intrinsic::aarch64_sve_uxtb:
+    return instCombineSVEUxt(IC, II, 8);
+  case Intrinsic::aarch64_sve_uxth:
+    return instCombineSVEUxt(IC, II, 16);
+  case Intrinsic::aarch64_sve_uxtw:
+    return instCombineSVEUxt(IC, II, 32);
   }
 
   return std::nullopt;
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll
index e194d6346e64e..86986b510aa27 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll
@@ -6,7 +6,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define <vscale x 2 x i64> @uxtb_z_64(<vscale x 2 x i64> %0) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_z_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 255)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
 ;
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -16,7 +16,7 @@ define <vscale x 2 x i64> @uxtb_z_64(<vscale x 2 x i64> %0) #0 {
 define <vscale x 2 x i64> @uxtb_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_m_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 255)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 ;
   %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -26,8 +26,7 @@ define <vscale x 2 x i64> @uxtb_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %
 define <vscale x 2 x i64> @uxtb_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_x_64(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 2 x i64> [[TMP1]], splat (i64 255)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 ;
   %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
@@ -62,7 +61,7 @@ define <vscale x 2 x i64> @uxtb_m_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2
 define <vscale x 4 x i32> @uxtb_z_32(<vscale x 4 x i32> %0) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_z_32(
 ; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 4 x i32> [[TMP0]], splat (i32 255)
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
 ;
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
@@ -72,7 +71,7 @@ define <vscale x 4 x i32> @uxtb_z_32(<vscale x 4 x i32> %0) #0 {
 define <vscale x 4 x i32> @uxtb_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_m_32(
 ; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 4 x i32> [[TMP0]], splat (i32 255)
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 ;
   %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> %1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
@@ -82,8 +81,7 @@ define <vscale x 4 x i32> @uxtb_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %
 define <vscale x 4 x i32> @uxtb_x_32(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_x_32(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 4 x i32> [[TMP1]], splat (i32 255)
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
 ;
   %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
@@ -118,7 +116,7 @@ define <vscale x 4 x i32> @uxtb_m_32_no_ptrue(<vscale x 16 x i1> %0, <vscale x 4
 define <vscale x 8 x i16> @uxtb_z_16(<vscale x 8 x i16> %0) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_z_16(
 ; CHECK-SAME: <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 8 x i16> [[TMP0]], splat (i16 255)
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
 ;
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> %0)
@@ -128,7 +126,7 @@ define <vscale x 8 x i16> @uxtb_z_16(<vscale x 8 x i16> %0) #0 {
 define <vscale x 8 x i16> @uxtb_m_16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_m_16(
 ; CHECK-SAME: <vscale x 8 x i16> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 8 x i16> [[TMP0]], splat (i16 255)
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
 ;
   %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> %1, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> %0)
@@ -138,8 +136,7 @@ define <vscale x 8 x i16> @uxtb_m_16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %
 define <vscale x 8 x i16> @uxtb_x_16(<vscale x 16 x i1> %0, <vscale x 8 x i16> %1) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_x_16(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 8 x i16> [[TMP1]], splat (i16 255)
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP4]]
 ;
   %3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
@@ -174,7 +171,7 @@ define <vscale x 8 x i16> @uxtb_m_16_no_ptrue(<vscale x 16 x i1> %0, <vscale x 8
 define <vscale x 2 x i64> @uxth_z_64(<vscale x 2 x i64> %0) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxth_z_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 65535)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
 ;
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -184,7 +181,7 @@ define <vscale x 2 x i64> @uxth_z_64(<vscale x 2 x i64> %0) #0 {
 define <vscale x 2 x i64> @uxth_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxth_m_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 65535)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 ;
   %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -194,8 +191,7 @@ define <vscale x 2 x i64> @uxth_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %
 define <vscale x 2 x i64> @uxth_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxth_x_64(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 2 x i64> [[TMP1]], splat (i64 65535)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 ;
   %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
@@ -230,7 +226,7 @@ define <vscale x 2 x i64> @uxth_m_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2
 define <vscale x 4 x i32> @uxth_z_32(<vscale x 4 x i32> %0) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxth_z_32(
 ; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 4 x i32> [[TMP0]], splat (i32 65535)
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
 ;
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
@@ -240,7 +236,7 @@ define <vscale x 4 x i32> @uxth_z_32(<vscale x 4 x i32> %0) #0 {
 define <vscale x 4 x i32> @uxth_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxth_m_32(
 ; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 4 x i32> [[TMP0]], splat (i32 65535)
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 ;
   %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> %1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
@@ -250,8 +246,7 @@ define <vscale x 4 x i32> @uxth_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %
 define <vscale x 4 x i32> @uxth_x_32(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxth_x_32(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 4 x i32> [[TMP1]], splat (i32 65535)
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
 ;
   %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
@@ -286,7 +281,7 @@ define <vscale x 4 x i32> @uxth_m_32_no_ptrue(<vscale x 16 x i1> %0, <vscale x 4
 define <vscale x 2 x i64> @uxtw_z_64(<vscale x 2 x i64> %0) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_z_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 4294967295)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
 ;
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -296,7 +291,7 @@ define <vscale x 2 x i64> @uxtw_z_64(<vscale x 2 x i64> %0) #0 {
 define <vscale x 2 x i64> @uxtw_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_m_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 4294967295)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 ;
   %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -306,8 +301,7 @@ define <vscale x 2 x i64> @uxtw_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %
 define <vscale x 2 x i64> @uxtw_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_x_64(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 2 x i64> [[TMP1]], splat (i64 4294967295)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 ;
   %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)

From 89ce3d8bb0c9fb5a9d1dd8b89be9a6d9806ac155 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 30 Apr 2025 08:13:49 -0700
Subject: [PATCH 3/3] Emit Intrinsic::aarch64_sve_and_u rather than stock and.
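
A minimal sketch of the new output, based on the updated CHECK lines
below: with this revision the `uxtb_z_64` case folds to the predicated
`and.u` intrinsic rather than a plain `and` (illustrative function name):

```llvm
define <vscale x 2 x i64> @uxtb_z_64_after(<vscale x 2 x i64> %x) {
  %r = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %x, <vscale x 2 x i64> splat (i64 255))
  ret <vscale x 2 x i64> %r
}
declare <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
```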

---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  4 +-
 .../InstCombine/AArch64/sve-intrinsic-uxt.ll  | 42 +++++++++++--------
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e9050d184f0f7..cd75811a1bad5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2654,7 +2654,9 @@ static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
     auto *Mask = ConstantVector::getSplat(
         Ty->getElementCount(),
         ConstantInt::get(Ty->getElementType(), MaskValue));
-    return IC.replaceInstUsesWith(II, IC.Builder.CreateAnd(Op, Mask));
+    auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
+                                           {Pg, Op, Mask});
+    return IC.replaceInstUsesWith(II, And);
   }
 
   return std::nullopt;
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll
index 86986b510aa27..755e11e231382 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-uxt.ll
@@ -6,7 +6,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define <vscale x 2 x i64> @uxtb_z_64(<vscale x 2 x i64> %0) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_z_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 255)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 255))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
 ;
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -16,7 +16,7 @@ define <vscale x 2 x i64> @uxtb_z_64(<vscale x 2 x i64> %0) #0 {
 define <vscale x 2 x i64> @uxtb_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_m_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 255))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 ;
   %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -26,7 +26,8 @@ define <vscale x 2 x i64> @uxtb_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %
 define <vscale x 2 x i64> @uxtb_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_x_64(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 2 x i64> [[TMP1]], splat (i64 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> splat (i64 255))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 ;
   %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
@@ -61,7 +62,7 @@ define <vscale x 2 x i64> @uxtb_m_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2
 define <vscale x 4 x i32> @uxtb_z_32(<vscale x 4 x i32> %0) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_z_32(
 ; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 4 x i32> [[TMP0]], splat (i32 255)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> splat (i32 255))
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
 ;
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
@@ -71,7 +72,7 @@ define <vscale x 4 x i32> @uxtb_z_32(<vscale x 4 x i32> %0) #0 {
 define <vscale x 4 x i32> @uxtb_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_m_32(
 ; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 4 x i32> [[TMP0]], splat (i32 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> splat (i32 255))
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 ;
   %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> %1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
@@ -81,7 +82,8 @@ define <vscale x 4 x i32> @uxtb_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %
 define <vscale x 4 x i32> @uxtb_x_32(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_x_32(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 4 x i32> [[TMP1]], splat (i32 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> splat (i32 255))
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
 ;
   %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
@@ -116,7 +118,7 @@ define <vscale x 4 x i32> @uxtb_m_32_no_ptrue(<vscale x 16 x i1> %0, <vscale x 4
 define <vscale x 8 x i16> @uxtb_z_16(<vscale x 8 x i16> %0) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_z_16(
 ; CHECK-SAME: <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 8 x i16> [[TMP0]], splat (i16 255)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.and.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> splat (i16 255))
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
 ;
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> %0)
@@ -126,7 +128,7 @@ define <vscale x 8 x i16> @uxtb_z_16(<vscale x 8 x i16> %0) #0 {
 define <vscale x 8 x i16> @uxtb_m_16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_m_16(
 ; CHECK-SAME: <vscale x 8 x i16> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 8 x i16> [[TMP0]], splat (i16 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.and.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> splat (i16 255))
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
 ;
   %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> %1, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> %0)
@@ -136,7 +138,8 @@ define <vscale x 8 x i16> @uxtb_m_16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %
 define <vscale x 8 x i16> @uxtb_x_16(<vscale x 16 x i1> %0, <vscale x 8 x i16> %1) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_x_16(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 8 x i16> [[TMP1]], splat (i16 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.and.u.nxv8i16(<vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> splat (i16 255))
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP4]]
 ;
   %3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %0)
@@ -171,7 +174,7 @@ define <vscale x 8 x i16> @uxtb_m_16_no_ptrue(<vscale x 16 x i1> %0, <vscale x 8
 define <vscale x 2 x i64> @uxth_z_64(<vscale x 2 x i64> %0) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxth_z_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 65535)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 65535))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
 ;
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -181,7 +184,7 @@ define <vscale x 2 x i64> @uxth_z_64(<vscale x 2 x i64> %0) #0 {
 define <vscale x 2 x i64> @uxth_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxth_m_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 65535)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 65535))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 ;
   %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -191,7 +194,8 @@ define <vscale x 2 x i64> @uxth_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %
 define <vscale x 2 x i64> @uxth_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxth_x_64(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 2 x i64> [[TMP1]], splat (i64 65535)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> splat (i64 65535))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 ;
   %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
@@ -226,7 +230,7 @@ define <vscale x 2 x i64> @uxth_m_64_no_ptrue(<vscale x 16 x i1> %0, <vscale x 2
 define <vscale x 4 x i32> @uxth_z_32(<vscale x 4 x i32> %0) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxth_z_32(
 ; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 4 x i32> [[TMP0]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> splat (i32 65535))
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
 ;
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
@@ -236,7 +240,7 @@ define <vscale x 4 x i32> @uxth_z_32(<vscale x 4 x i32> %0) #0 {
 define <vscale x 4 x i32> @uxth_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxth_m_32(
 ; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 4 x i32> [[TMP0]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> splat (i32 65535))
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 ;
   %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> %1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
@@ -246,7 +250,8 @@ define <vscale x 4 x i32> @uxth_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %
 define <vscale x 4 x i32> @uxth_x_32(<vscale x 16 x i1> %0, <vscale x 4 x i32> %1) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @uxth_x_32(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 4 x i32> [[TMP1]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> splat (i32 65535))
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
 ;
   %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %0)
@@ -281,7 +286,7 @@ define <vscale x 4 x i32> @uxth_m_32_no_ptrue(<vscale x 16 x i1> %0, <vscale x 4
 define <vscale x 2 x i64> @uxtw_z_64(<vscale x 2 x i64> %0) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_z_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 4294967295)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 4294967295))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
 ;
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -291,7 +296,7 @@ define <vscale x 2 x i64> @uxtw_z_64(<vscale x 2 x i64> %0) #0 {
 define <vscale x 2 x i64> @uxtw_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_m_64(
 ; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = and <vscale x 2 x i64> [[TMP0]], splat (i64 4294967295)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 4294967295))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 ;
   %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
@@ -301,7 +306,8 @@ define <vscale x 2 x i64> @uxtw_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %
 define <vscale x 2 x i64> @uxtw_x_64(<vscale x 16 x i1> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_x_64(
 ; CHECK-SAME: <vscale x 16 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 2 x i64> [[TMP1]], splat (i64 4294967295)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> [[TMP3]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> splat (i64 4294967295))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 ;
   %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)


