[llvm] [AArch64][SVE] optimisation for unary SVE store intrinsics with no active lanes (PR #95793)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 17 07:14:30 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: None (Lukacma)
<details>
<summary>Changes</summary>
This patch extends https://github.com/llvm/llvm-project/pull/73964 by adding an InstCombine optimisation for SVE store intrinsics: when the governing predicate has no active lanes (an all-zero predicate), the store writes nothing, so the intrinsic call is erased.
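For illustration, a minimal sketch of the fold (it mirrors the new `test_st1` case in the added test file; the function name `@example` is just a placeholder). An all-false predicate means the store writes no lanes, so the call can be removed outright:

```llvm
; Run with: opt -S -passes=instcombine example.ll
declare void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, ptr)

define void @example(ptr %a, <vscale x 16 x i8> %b) {
  ; The governing predicate is all-false, so no lanes are written ...
  call void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
  ; ... and instcombine leaves only the 'ret void' behind.
  ret void
}
```

The same reasoning applies to the scatter, non-temporal (stnt1) and structured (st2/st3/st4, including the quadword variants) stores handled in the diff below.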
---
Full diff: https://github.com/llvm/llvm-project/pull/95793.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+45)
- (added) llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll (+309)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f5756fc7e401..d2a31a6ce4dc6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -985,6 +985,16 @@ static bool isAllActivePredicate(Value *Pred) {
m_ConstantInt<AArch64SVEPredPattern::all>()));
}
+// Erase unary operation where predicate has all inactive lanes
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
+ int PredPos) {
+ if (match(II.getOperand(PredPos), m_ZeroInt())) {
+ return IC.eraseInstFromFunction(II);
+ }
+ return std::nullopt;
+}
+
static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
IntrinsicInst &II) {
// svsel(ptrue, x, y) => x
@@ -1417,6 +1427,10 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Value *Pred = II.getOperand(1);
Value *PtrOp = II.getOperand(2);
+ // Remove when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryErase(IC, II, 1))
+ return II_NA;
+
if (isAllActivePredicate(Pred)) {
StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
Store->copyMetadata(II);
@@ -1775,6 +1789,10 @@ instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
Value *Index = II.getOperand(3);
Type *Ty = Val->getType();
+ // Remove when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryErase(IC, II, 1))
+ return II_NA;
+
// Contiguous scatter => masked store.
// (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
// => (masked.store Value (gep BasePtr IndexBase) Align Mask)
@@ -1971,6 +1989,33 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
switch (IID) {
default:
break;
+
+ case Intrinsic::aarch64_sve_st1_scatter:
+ case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw:
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw:
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
+ case Intrinsic::aarch64_sve_st1dq:
+ case Intrinsic::aarch64_sve_st1q_scatter_index:
+ case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
+ case Intrinsic::aarch64_sve_st1wq:
+ case Intrinsic::aarch64_sve_stnt1:
+ case Intrinsic::aarch64_sve_stnt1_scatter:
+ case Intrinsic::aarch64_sve_stnt1_scatter_index:
+ case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
+ return instCombineSVENoActiveUnaryErase(IC, II, 1);
+ case Intrinsic::aarch64_sve_st2:
+ case Intrinsic::aarch64_sve_st2q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 2);
+ case Intrinsic::aarch64_sve_st3:
+ case Intrinsic::aarch64_sve_st3q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 3);
+ case Intrinsic::aarch64_sve_st4:
+ case Intrinsic::aarch64_sve_st4q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 4);
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
return instCombineMaxMinNM(IC, II);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll
new file mode 100644
index 0000000000000..00681dee3fcb5
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll
@@ -0,0 +1,309 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_st1(ptr %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: define void @test_st1(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define void @test_st1_scatter(
+; CHECK-SAME: <vscale x 2 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @test_st1_scatter_index(<vscale x 2 x i32> %data_trunc, ptr %base, <vscale x 2 x i64> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_index(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_scalar_offset(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i32> %base) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], <vscale x 4 x i32> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @test_st1_scatter_sxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_sxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_sxtw_index(<vscale x 4 x i16> %data_trunc, ptr %base, <vscale x 4 x i32> %indices) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_sxtw_index(
+; CHECK-SAME: <vscale x 4 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @test_st1_scatter_uxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_uxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_uxtw_index(<vscale x 4 x i16> %data_trunc, ptr %base, <vscale x 4 x i32> %indices) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_uxtw_index(
+; CHECK-SAME: <vscale x 4 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @test_st1dq(<vscale x 2 x i64> %zt, ptr %gep1) #0 {
+; CHECK-LABEL: define void @test_st1dq(
+; CHECK-SAME: <vscale x 2 x i64> [[ZT:%.*]], ptr [[GEP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> zeroinitializer, ptr %gep1)
+ ret void
+}
+
+define void @test_st1q_scatter_index(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_index(
+; CHECK-SAME: <vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[PG:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_st1q_scatter_scalar_offset(<vscale x 2 x i64> %data, <vscale x 2 x i64> %base) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i64> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> zeroinitializer, <vscale x 2 x i64> %base, i64 0)
+ ret void
+}
+
+define void @test_st1q_scatter_vector_offset(<vscale x 8 x i16> %data, ptr %base, <vscale x 2 x i64> %off) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_vector_offset(
+; CHECK-SAME: <vscale x 8 x i16> [[DATA:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_st1wq(ptr %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st1wq(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %b, <vscale x 1 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+
+define void @test_st2(ptr %a, <vscale x 8 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st2(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 8 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 4)
+ tail call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st2q(ptr %a, <vscale x 8 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st2q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 8 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 4)
+ tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st3(ptr %a, <vscale x 12 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st3(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 12 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 8)
+ tail call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st3q(ptr %a, <vscale x 12 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st3q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 12 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 8)
+ tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st4(ptr %a, <vscale x 16 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st4(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 8)
+ %3 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 12)
+ tail call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st4q(ptr %a, <vscale x 16 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st4q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 8)
+ %3 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 12)
+ tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_stnt1(ptr %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: define void @test_stnt1(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_stnt1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter(
+; CHECK-SAME: <vscale x 2 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @test_stnt1_scatter_index(<vscale x 2 x i32> %data_trunc, ptr %base, <vscale x 2 x i64> %offsets) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_index(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @test_stnt1_scatter_scalar_offset(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i32> %base) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], <vscale x 4 x i32> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @test_stnt1_scatter_uxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_uxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/95793