[llvm] [LoopVectorize] Enable shuffle padding for masked interleaved accesses (PR #75329)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 13 04:17:30 PST 2023


llvmbot wrote:


@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-llvm-analysis

Author: None (huhu233)

<details>
<summary>Changes</summary>

Consider the following struct and loop, where only the first two of the three fields are updated:

```c
typedef struct {
    float x;
    float y;
    float z;
} patic;

for (int i = 0; i < num; i++) {
  ps[i].x = factor * ps[i].x;
  ps[i].y = factor * ps[i].y;
}
```

This patch pads the gaps of the interleaved store group to eliminate the masked.store, which helps the Interleaved Access pass generate better code, as shown below:

```llvm
%wide.vec = load <12 x float>                          ; 0,1,2,...,11
%shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9>    ; 0,3,6,9
%shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10>   ; 1,4,7,10
%padded   = shuffle %wide.vec, poison, <2, 5, 8, 11>   ; 2,5,8,11

%concate1 = shuffle %op1, %op2, <0, 1, ..., 7>         ; 0,3,6,9,1,4,7,10
%concate2 = shuffle %padded, poison,
            <0, 1, ..., 3, undef, undef, undef, undef> ; 2,5,8,11,poison,...,poison
%concateFinal = shuffle %concate1, %concate2,
            <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>     ; 0,1,2,...,11
store <12 x float> %concateFinal
```
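Conceptually, padding the gap means the untouched field is stored back with its own loaded value, so the partial store becomes a full-width store that the Interleaved Access pass can lower to a structure store (the new AArch64 tests check for `llvm.aarch64.sve.st3`). A minimal C sketch of this equivalent scalar view, reusing the `patic` struct above (the function wrapper is only for illustration):

```c
/* Equivalent scalar view of the padded store group: the z field is
 * reloaded and stored back unchanged, so all three fields are written
 * and no gap mask is needed for the store. */
void scale_xy_padded(patic *ps, int num, float factor) {
  for (int i = 0; i < num; i++) {
    ps[i].x = factor * ps[i].x;
    ps[i].y = factor * ps[i].y;
    ps[i].z = ps[i].z; /* padded lane taken from the matched load group */
  }
}
```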

This patch adds some restrictions for shuffle padding: the target interleaved store group must have a matching interleaved load group, which means:
1. The value operand of the StoreInst comes from a LoadInst.
2. The store group and the load group access the same underlying object.
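
As a counterexample (mirroring the `@test3` case in the added tests), a loop whose stored values do not come from loads of the same object fails restriction 1, so the store group is not padded and no structure store is generated. A small C sketch (the function wrapper is illustrative):

```c
/* Not eligible for shuffle padding: the stored values come from the
 * induction variable rather than from loads of ps, so there is no
 * matched load group to pad the gap with. */
void write_xy(patic *ps, int num) {
  for (int i = 0; i < num; i++) {
    ps[i].x = (float)i;
    ps[i].y = (float)(i + 1);
  }
}
```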

---

Patch is 24.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75329.diff


5 Files Affected:

- (modified) llvm/include/llvm/Analysis/VectorUtils.h (+7) 
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+23) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+67-5) 
- (added) llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store-cost.ll (+47) 
- (added) llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store.ll (+279) 


``````````diff
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 55a6aa645a86e2..01727474432aeb 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -819,6 +819,13 @@ class InterleavedAccessInfo {
   /// Returns true if we have any interleave groups.
   bool hasGroups() const { return !InterleaveGroups.empty(); }
 
+  /// Check if the interleaved store group has a matched load group, which
+  /// means the store must satisfy the following restrictions:
+  /// 1. The value operand of the StoreInst comes from a LoadInst.
+  /// 2. The store group and the load group access the same memory.
+  Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
+                                     Value *Ptr) const;
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 91d8c31fa062de..b1ef36109c1669 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1441,6 +1441,29 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
   RequiresScalarEpilogue = false;
 }
 
+Value *InterleavedAccessInfo::hasMatchedLoadGroupForStore(Instruction *Inst,
+                                                          BasicBlock *BB,
+                                                          Value *Ptr) const {
+  if (isa<PHINode>(Inst) || Inst->getParent() != BB)
+    return nullptr;
+
+  if (isa<LoadInst>(Inst)) {
+    Value *V = getUnderlyingObject(Inst->getOperand(0));
+    auto Group = getInterleaveGroup(Inst);
+    if (Group && (V == Ptr))
+      return Group->getInsertPos();
+  }
+
+  for (unsigned It = 0; It < Inst->getNumOperands(); It++) {
+    if (Instruction *I = dyn_cast<Instruction>(Inst->getOperand(It)))
+      if (Value *MatchedLoadGroupEntry =
+              hasMatchedLoadGroupForStore(I, BB, Ptr))
+        return MatchedLoadGroupEntry;
+  }
+
+  return nullptr;
+}
+
 template <typename InstT>
 void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
   llvm_unreachable("addMetadata can only be used for Instruction");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f82e161fb846d1..09daa3d32d3b3e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -408,6 +408,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
 // after prolog. See `emitIterationCountCheck`.
 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
 
+static cl::opt<bool> EnableShufflePadding(
+    "enable-shuffle-padding", cl::init(true), cl::Hidden,
+    cl::desc("Enable shuffle padding to generate structure store."));
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type.
@@ -796,6 +800,11 @@ class InnerLoopVectorizer {
   // correct start value of reduction PHIs when vectorizing the epilogue.
   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
       ReductionResumeValues;
+
+  /// The map stores shuffles that are used to pad the gaps of interleaved
+  /// store groups. The key for the map is the entry of the load group that is
+  /// matched to the related store group.
+  MapVector<Value *, SmallVector<SmallVector<Value *, 4>, 4>> PaddedShufflesMap;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1702,6 +1711,11 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
+                                     Value *Ptr) const {
+    return InterleaveInfo.hasMatchedLoadGroupForStore(Inst, BB, Ptr);
+  }
+
 private:
   unsigned NumPredStores = 0;
 
@@ -2557,6 +2571,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
                        : ShuffledMask;
   };
 
+  Value *MatchedLoad = nullptr;
+  bool IsShufflePadding = false;
+  if (EnableShufflePadding && useMaskedInterleavedAccesses(*TTI) &&
+      TTI->enableScalableVectorization()) {
+    IsShufflePadding = true;
+    if (isa<StoreInst>(Instr) && (Group->getNumMembers() != Group->getFactor()))
+      MatchedLoad = Cost->hasMatchedLoadGroupForStore(
+          Instr, Instr->getParent(), getUnderlyingObject(Instr->getOperand(1)));
+  }
+
   // Vectorize the interleaved load group.
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
@@ -2626,8 +2650,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
     for (unsigned I = 0; I < InterleaveFactor; ++I) {
       Instruction *Member = Group->getMember(I);
 
-      // Skip the gaps in the group.
-      if (!Member)
+      SmallVector<Value *, 4> Shuffles;
+      // Skip the gaps in the group if there is no shuffle padding.
+      if (!Member && !IsShufflePadding)
         continue;
 
       auto StrideMask =
@@ -2636,6 +2661,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
         Value *StridedVec = Builder.CreateShuffleVector(
             NewLoads[Part], StrideMask, "strided.vec");
 
+        if (!Member) {
+          if (Group->isReverse())
+            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
+          Shuffles.push_back(StridedVec);
+          continue;
+        }
         // If this member has different type, cast the result type.
         if (Member->getType() != ScalarTy) {
           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
@@ -2646,9 +2677,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
         if (Group->isReverse())
           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
 
+        Shuffles.push_back(StridedVec);
+
         State.set(VPDefs[J], StridedVec, Part);
       }
-      ++J;
+      PaddedShufflesMap[Instr].push_back(Shuffles);
+      if (Member)
+        ++J;
     }
     return;
   }
@@ -2672,6 +2707,24 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
              "Fail to get a member from an interleaved store group");
       Instruction *Member = Group->getMember(i);
 
+      if (!Member && MatchedLoad) {
+        // %wide.vec = load <12 x float>; 0,1,2,3,...,11
+        // %shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9> ; 0,3,6,9
+        // %shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10
+        // %padded = shuffle %wide.vec, poison, <2, 5, 8, 11> ; 2,5,8,11
+        //
+        // %concate1 = shuffle %op1, %op2, <0, 1, ..., 7> ; 0,3,6,9,1,4,7,10
+        // %concate2 = shuffle %padded, poison,
+        //    <0, 1, ..., 3, undef, undef, undef, undef>
+        //    ; 2,5,8,11,poison,...,poison
+        // %concateFinal = shuffle %concate1, %concate2,
+        //    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; 0,1,2,3,...,11
+        // store <12 x float> %concateFinal
+        Value *PaddedShuffle = PaddedShufflesMap[MatchedLoad][i][Part];
+        StoredVecs.push_back(PaddedShuffle);
+        continue;
+      }
+
       // Skip the gaps in the group.
       if (!Member) {
         Value *Undef = PoisonValue::get(SubVT);
@@ -2696,7 +2749,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
     // Interleave all the smaller vectors into one wider vector.
     Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
     Instruction *NewStoreInstr;
-    if (BlockInMask || MaskForGaps) {
+    if ((BlockInMask || MaskForGaps) && !MatchedLoad) {
       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                 Group->getAlign(), GroupMask);
@@ -6325,10 +6378,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
     if (Group->getMember(IF))
       Indices.push_back(IF);
 
+  bool IsShufflePaddingStore = false;
+  if (EnableShufflePadding && useMaskedInterleavedAccesses(TTI) &&
+      TTI.enableScalableVectorization() && !VF.isScalable())
+    IsShufflePaddingStore = true;
+
   // Calculate the cost of the whole interleaved group.
+  // If shuffle padding is enabled, ignore gaps.
   bool UseMaskForGaps =
       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
-      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
+      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()) &&
+       (!IsShufflePaddingStore ||
+        !hasMatchedLoadGroupForStore(I, I->getParent(),
+                                     getUnderlyingObject(I->getOperand(1)))));
   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store-cost.ll
new file mode 100644
index 00000000000000..79a1d92cd1454b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store-cost.ll
@@ -0,0 +1,47 @@
+; REQUIRES: asserts
+; RUN: opt -enable-shuffle-padding=true -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize  -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1  | FileCheck %s --check-prefixes=PADDING
+; RUN: opt -enable-shuffle-padding=false -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize  -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1  | FileCheck %s --check-prefixes=NO-PADDING
+
+%struct.patic = type { float, float, float }
+
+; for (int i = 0; i < num; i++) {
+;   ps[i].x = factor * ps[i].x;
+;   ps[i].y = factor * ps[i].y;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(2,2)
+define void @shufflePadding(i32 noundef %num, ptr nocapture noundef %ps) {
+; PADDING-LABEL: 'shufflePadding'
+; PADDING: LV: Found an estimated cost of 3 for VF 16 For instruction:   store float %mul6, ptr %y, align 4
+
+; NO-PADDING-LABEL: 'shufflePadding'
+; NO-PADDING: LV: Found an estimated cost of 188 for VF 16 For instruction:   store float %mul6, ptr %y, align 4
+entry:
+  %cmp19 = icmp sgt i32 %num, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %mul = fmul fast float %0, 0x40019999A0000000
+  store float %mul, ptr %arrayidx, align 4
+  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  %1 = load float, ptr %y, align 4
+  %mul6 = fmul fast float %1, 0x40019999A0000000
+  store float %mul6, ptr %y, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store.ll
new file mode 100644
index 00000000000000..6c02274073e585
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store.ll
@@ -0,0 +1,279 @@
+; RUN: opt -mtriple=aarch64 -mattr=+sve -S -passes=loop-vectorize,interleaved-access  -enable-masked-interleaved-mem-accesses=true -sve-gather-overhead=10 -sve-scatter-overhead=10  -enable-shuffle-padding=true -force-vector-width=16 -aarch64-sve-vector-bits-min=512 < %s -o - | FileCheck %s --check-prefixes=ENABLE-SHUFFLE-PADDING
+; RUN: opt -mtriple=aarch64 -mattr=+sve -S -passes=loop-vectorize,interleaved-access  -enable-masked-interleaved-mem-accesses=true -sve-gather-overhead=10 -sve-scatter-overhead=10  -enable-shuffle-padding=false -force-vector-width=16  -aarch64-sve-vector-bits-min=512 < %s -o - | FileCheck %s --check-prefixes=DISABLE-SHUFFLE-PADDING
+
+%struct.patic = type { float, float, float }
+
+; for (int i = 0; i < num; i++) {
+;   ps[i].x = factor * ps[i].x;
+;   ps[i].y = factor * ps[i].y;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(2,2)
+define void @test(i32 noundef %num, ptr nocapture noundef %ps) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING:         call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+
+entry:
+  %cmp19 = icmp sgt i32 %num, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %mul = fmul fast float %0, 0x40019999A0000000
+  store float %mul, ptr %arrayidx, align 4
+  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  %1 = load float, ptr %y, align 4
+  %mul6 = fmul fast float %1, 0x40019999A0000000
+  store float %mul6, ptr %y, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; for (int i = 0; i < num; i++) {
+;   ps[i].x = factor * ps[i].x;
+;   ps[i].z = factor * ps[i].z;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(2,2)
+define void @test1(i32 noundef %num, ptr nocapture noundef %ps) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test1(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING:         call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test1(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+
+entry:
+  %cmp19 = icmp sgt i32 %num, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %mul = fmul fast float %0, 0x40019999A0000000
+  store float %mul, ptr %arrayidx, align 4
+  %z = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 2
+  %1 = load float, ptr %z, align 4
+  %mul6 = fmul fast float %1, 0x40019999A0000000
+  store float %mul6, ptr %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; for (int i = 0; i < num; i++) {
+;   ps[i].y = factor * ps[i].y;
+;   ps[i].z = factor * ps[i].z;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(2,2)
+define void @test2(i32 noundef %num, ptr nocapture noundef %ps) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test2(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING:         call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test2(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+
+entry:
+  %cmp19 = icmp sgt i32 %num, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  %0 = load float, ptr %y, align 4
+  %mul = fmul fast float %0, 0x40019999A0000000
+  store float %mul, ptr %y, align 4
+  %z = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 2
+  %1 = load float, ptr %z, align 4
+  %mul6 = fmul fast float %1, 0x40019999A0000000
+  store float %mul6, ptr %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Currently, we don't support the following scenario, as shuffle padding
+; requires that the stored values be loaded from the stored object itself
+; (possibly with some operation applied to them).
+;
+; for (int i = 0; i < num; i++) {
+;   ps[i].x = i;
+;   ps[i].y = i + 1;
+; }
+;
+; Function Attrs: argmemonly nofree norecurse nosync nounwind writeonly uwtable vscale_range(4,4)
+define dso_local void @test3(i32 noundef %num, ptr nocapture noundef writeonly %ps) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test3(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test3(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+entry:
+  %cmp10 = icmp sgt i32 %num, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %0 to float
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  store float %conv, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %1 = trunc i64 %indvars.iv.next to i32
+  %conv1 = sitofp i32 %1 to float
+  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  store float %conv1, ptr %y, align 4
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, la...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/75329


More information about the llvm-commits mailing list