[llvm] [LoopVectorize] Enable shuffle padding for masked interleaved accesses (PR #75329)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 13 04:25:28 PST 2023


https://github.com/huhu233 updated https://github.com/llvm/llvm-project/pull/75329

From b93600ffef906f85917119ccdec566b4273d6588 Mon Sep 17 00:00:00 2001
From: zhangtiehu <z00562868 at china.huawei.com>
Date: Wed, 13 Dec 2023 20:02:10 +0800
Subject: [PATCH] [LoopVectorize] Enable shuffle padding for masked interleaved
 accesses

typedef struct {
    float x;
    float y;
    float z;
} patic;

for (int i = 0; i < num; i++) {
  ps[i].x = factor * ps[i].x;
  ps[i].y = factor * ps[i].y;
}
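
Without shuffle padding, the gap in the store group (ps[i].z is never
written) forces the vectorizer to emit a masked store whose mask disables
the gap lanes, roughly as follows (a minimal sketch for VF=4; %addr,
%interleaved.vec and the exact mask are illustrative):

%interleaved.vec = shuffle ... ; lanes for z are poison
call void @llvm.masked.store.v12f32.p0(
    <12 x float> %interleaved.vec, ptr %addr, i32 4,
    <12 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0,
               i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>)

Such a masked store with gaps is not recognized by the Interleaved Access
pass, so no structured store (e.g. st3 on AArch64 SVE) is formed.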

This patch pads the gaps of interleaved store groups to eliminate the
masked.store, which helps the Interleaved Access pass generate better code,
as shown below:

%wide.vec = load <12 x float>; 0,1,2,3,...,11
%shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9> ; 0,3,6,9
%shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10
%padded = shuffle %wide.vec, poison, <2, 5, 8, 11> ; 2,5,8,11

%concate1 = shuffle %shuffle1, %shuffle2, <0, 1, ..., 7> ; 0,3,6,9,1,4,7,10
%concate2 = shuffle %padded, poison,
            <0, 1, ..., 3, undef, undef, undef, undef> ; 2,5,8,11,poison,...,poison
%concateFinal = shuffle %concate1, %concate2,
                <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; 0,1,2,3,...,11
store <12 x float> %concateFinal

This patch restricts shuffle padding to interleaved store groups that
have a matched interleaved load group, which means (see the sketch
after this list):
1. The value operand of the StoreInst comes from a LoadInst.
2. The store group and the load group access the same underlying
   object.
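
For example, the following pattern satisfies both restrictions (a minimal
sketch based on the new tests; %ps, %factor and the other names are
illustrative):

%p = getelementptr inbounds %struct.patic, ptr %ps, i64 %i
%v = load float, ptr %p    ; member of the matched load group on %ps
%m = fmul fast float %v, %factor
store float %m, ptr %p     ; store group on the same underlying object %ps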
---
 llvm/include/llvm/Analysis/VectorUtils.h      |   7 +
 llvm/lib/Analysis/VectorUtils.cpp             |  23 ++
 .../Transforms/Vectorize/LoopVectorize.cpp    |  72 ++++-
 .../AArch64/sve-structured-store-cost.ll      |  47 +++
 .../AArch64/sve-structured-store.ll           | 279 ++++++++++++++++++
 5 files changed, 423 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store-cost.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store.ll

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 55a6aa645a86e2..01727474432aeb 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -819,6 +819,13 @@ class InterleavedAccessInfo {
   /// Returns true if we have any interleave groups.
   bool hasGroups() const { return !InterleaveGroups.empty(); }
 
+  /// Check whether the interleaved store group has a matched load group,
+  /// i.e. the store satisfies the following restrictions:
+  /// 1. The value operand of the StoreInst comes from a LoadInst.
+  /// 2. The store group and the load group access the same underlying object.
+  Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
+                                     Value *Ptr) const;
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 91d8c31fa062de..b1ef36109c1669 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1441,6 +1441,29 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
   RequiresScalarEpilogue = false;
 }
 
+Value *InterleavedAccessInfo::hasMatchedLoadGroupForStore(Instruction *Inst,
+                                                          BasicBlock *BB,
+                                                          Value *Ptr) const {
+  if (isa<PHINode>(Inst) || Inst->getParent() != BB)
+    return nullptr;
+
+  if (isa<LoadInst>(Inst)) {
+    Value *V = getUnderlyingObject(Inst->getOperand(0));
+    auto Group = getInterleaveGroup(Inst);
+    if (Group && (V == Ptr))
+      return Group->getInsertPos();
+  }
+
+  for (unsigned It = 0; It < Inst->getNumOperands(); It++) {
+    if (Instruction *I = dyn_cast<Instruction>(Inst->getOperand(It)))
+      if (Value *MatchedLoadGroupEntry =
+              hasMatchedLoadGroupForStore(I, BB, Ptr))
+        return MatchedLoadGroupEntry;
+  }
+
+  return nullptr;
+}
+
 template <typename InstT>
 void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
   llvm_unreachable("addMetadata can only be used for Instruction");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f82e161fb846d1..09daa3d32d3b3e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -408,6 +408,10 @@ static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
 // after prolog. See `emitIterationCountCheck`.
 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
 
+static cl::opt<bool> EnableShufflePadding(
+    "enable-shuffle-padding", cl::init(true), cl::Hidden,
+    cl::desc("Enable shuffle padding to generate structured stores."));
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type.
@@ -796,6 +800,11 @@ class InnerLoopVectorizer {
   // correct start value of reduction PHIs when vectorizing the epilogue.
   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
       ReductionResumeValues;
+
+  /// The map stores the shuffles that are used to pad the gaps of interleaved
+  /// store groups. The key is the insert position of the load group that is
+  /// matched to the related store group.
+  MapVector<Value *, SmallVector<SmallVector<Value *, 4>, 4>> PaddedShufflesMap;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1702,6 +1711,11 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  Value *hasMatchedLoadGroupForStore(Instruction *Inst, BasicBlock *BB,
+                                     Value *Ptr) const {
+    return InterleaveInfo.hasMatchedLoadGroupForStore(Inst, BB, Ptr);
+  }
+
 private:
   unsigned NumPredStores = 0;
 
@@ -2557,6 +2571,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
                        : ShuffledMask;
   };
 
+  Value *MatchedLoad = nullptr;
+  bool IsShufflePadding = false;
+  if (EnableShufflePadding && useMaskedInterleavedAccesses(*TTI) &&
+      TTI->enableScalableVectorization()) {
+    IsShufflePadding = true;
+    if (isa<StoreInst>(Instr) && (Group->getNumMembers() != Group->getFactor()))
+      MatchedLoad = Cost->hasMatchedLoadGroupForStore(
+          Instr, Instr->getParent(), getUnderlyingObject(Instr->getOperand(1)));
+  }
+
   // Vectorize the interleaved load group.
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
@@ -2626,8 +2650,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
     for (unsigned I = 0; I < InterleaveFactor; ++I) {
       Instruction *Member = Group->getMember(I);
 
-      // Skip the gaps in the group.
-      if (!Member)
+      SmallVector<Value *, 4> Shuffles;
+      // Skip the gaps in the group if shuffle padding is disabled.
+      if (!Member && !IsShufflePadding)
         continue;
 
       auto StrideMask =
@@ -2636,6 +2661,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
         Value *StridedVec = Builder.CreateShuffleVector(
             NewLoads[Part], StrideMask, "strided.vec");
 
+        if (!Member) {
+          if (Group->isReverse())
+            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
+          Shuffles.push_back(StridedVec);
+          continue;
+        }
         // If this member has different type, cast the result type.
         if (Member->getType() != ScalarTy) {
           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
@@ -2646,9 +2677,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
         if (Group->isReverse())
           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
 
+        Shuffles.push_back(StridedVec);
+
         State.set(VPDefs[J], StridedVec, Part);
       }
-      ++J;
+      PaddedShufflesMap[Instr].push_back(Shuffles);
+      if (Member)
+        ++J;
     }
     return;
   }
@@ -2672,6 +2707,24 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
              "Fail to get a member from an interleaved store group");
       Instruction *Member = Group->getMember(i);
 
+      if (!Member && MatchedLoad) {
+        // %wide.vec = load <12 x float>; 0,1,2,3,...,11
+        // %shuffle1 = shuffle %wide.vec, poison, <0, 3, 6, 9> ; 0,3,6,9
+        // %shuffle2 = shuffle %wide.vec, poison, <1, 4, 7, 10> ; 1,4,7,10
+        // %padded = shuffle %wide.vec, poison, <2, 5, 8, 11> ; 2,5,8,11
+        //
+        // %concate1 = shuffle %shuffle1, %shuffle2, <0, 1, ..., 7> ; 0,3,6,9,1,4,7,10
+        // %concate2 = shuffle %padded, poison,
+        //    <0, 1, ..., 3, undef, undef, undef, undef>
+        //    ; 2,5,8,11,poison,...,poison
+        // %concateFinal = shuffle %concate1, %concate2,
+        //    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; 0,1,2,3,...,11
+        // store <12 x float> %concateFinal
+        Value *PaddedShuffle = PaddedShufflesMap[MatchedLoad][i][Part];
+        StoredVecs.push_back(PaddedShuffle);
+        continue;
+      }
+
       // Skip the gaps in the group.
       if (!Member) {
         Value *Undef = PoisonValue::get(SubVT);
@@ -2696,7 +2749,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
     // Interleave all the smaller vectors into one wider vector.
     Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
     Instruction *NewStoreInstr;
-    if (BlockInMask || MaskForGaps) {
+    if ((BlockInMask || MaskForGaps) && !MatchedLoad) {
       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                 Group->getAlign(), GroupMask);
@@ -6325,10 +6378,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
     if (Group->getMember(IF))
       Indices.push_back(IF);
 
+  bool IsShufflePaddingStore = false;
+  if (EnableShufflePadding && useMaskedInterleavedAccesses(TTI) &&
+      TTI.enableScalableVectorization() && !VF.isScalable())
+    IsShufflePaddingStore = true;
+
   // Calculate the cost of the whole interleaved group.
+  // If shuffle padding applies (a matched load group exists), ignore the gaps.
   bool UseMaskForGaps =
       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
-      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
+      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()) &&
+       (!IsShufflePaddingStore ||
+        !hasMatchedLoadGroupForStore(I, I->getParent(),
+                                     getUnderlyingObject(I->getOperand(1)))));
   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store-cost.ll
new file mode 100644
index 00000000000000..79a1d92cd1454b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store-cost.ll
@@ -0,0 +1,47 @@
+; REQUIRES: asserts
+; RUN: opt -enable-shuffle-padding=true -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize  -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1  | FileCheck %s --check-prefixes=PADDING
+; RUN: opt -enable-shuffle-padding=false -enable-masked-interleaved-mem-accesses=true -passes=loop-vectorize -debug-only=loop-vectorize  -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=512 -S < %s 2>&1  | FileCheck %s --check-prefixes=NO-PADDING
+
+%struct.patic = type { float, float, float }
+
+; for (int i = 0; i < num; i++) {
+;   ps[i].x = factor * ps[i].x;
+;   ps[i].y = factor * ps[i].y;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(2,2)
+define void @shufflePadding(i32 noundef %num, ptr nocapture noundef %ps) {
+; PADDING-LABEL: 'shufflePadding'
+; PADDING: LV: Found an estimated cost of 3 for VF 16 For instruction:   store float %mul6, ptr %y, align 4
+
+; NO-PADDING-LABEL: 'shufflePadding'
+; NO-PADDING: LV: Found an estimated cost of 188 for VF 16 For instruction:   store float %mul6, ptr %y, align 4
+entry:
+  %cmp19 = icmp sgt i32 %num, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %mul = fmul fast float %0, 0x40019999A0000000
+  store float %mul, ptr %arrayidx, align 4
+  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  %1 = load float, ptr %y, align 4
+  %mul6 = fmul fast float %1, 0x40019999A0000000
+  store float %mul6, ptr %y, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store.ll
new file mode 100644
index 00000000000000..6c02274073e585
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-structured-store.ll
@@ -0,0 +1,279 @@
+; RUN: opt -mtriple=aarch64 -mattr=+sve -S -passes=loop-vectorize,interleaved-access  -enable-masked-interleaved-mem-accesses=true -sve-gather-overhead=10 -sve-scatter-overhead=10  -enable-shuffle-padding=true -force-vector-width=16 -aarch64-sve-vector-bits-min=512 < %s -o - | FileCheck %s --check-prefixes=ENABLE-SHUFFLE-PADDING
+; RUN: opt -mtriple=aarch64 -mattr=+sve -S -passes=loop-vectorize,interleaved-access  -enable-masked-interleaved-mem-accesses=true -sve-gather-overhead=10 -sve-scatter-overhead=10  -enable-shuffle-padding=false -force-vector-width=16  -aarch64-sve-vector-bits-min=512 < %s -o - | FileCheck %s --check-prefixes=DISABLE-SHUFFLE-PADDING
+
+%struct.patic = type { float, float, float }
+
+; for (int i = 0; i < num; i++) {
+;   ps[i].x = factor * ps[i].x;
+;   ps[i].y = factor * ps[i].y;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(2,2)
+define void @test(i32 noundef %num, ptr nocapture noundef %ps) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING:         call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+
+entry:
+  %cmp19 = icmp sgt i32 %num, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %mul = fmul fast float %0, 0x40019999A0000000
+  store float %mul, ptr %arrayidx, align 4
+  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  %1 = load float, ptr %y, align 4
+  %mul6 = fmul fast float %1, 0x40019999A0000000
+  store float %mul6, ptr %y, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; for (int i = 0; i < num; i++) {
+;   ps[i].x = factor * ps[i].x;
+;   ps[i].z = factor * ps[i].z;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(2,2)
+define void @test1(i32 noundef %num, ptr nocapture noundef %ps) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test1(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING:         call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test1(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+
+entry:
+  %cmp19 = icmp sgt i32 %num, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %mul = fmul fast float %0, 0x40019999A0000000
+  store float %mul, ptr %arrayidx, align 4
+  %z = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 2
+  %1 = load float, ptr %z, align 4
+  %mul6 = fmul fast float %1, 0x40019999A0000000
+  store float %mul6, ptr %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; for (int i = 0; i < num; i++) {
+;   ps[i].y = factor * ps[i].y;
+;   ps[i].z = factor * ps[i].z;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(2,2)
+define void @test2(i32 noundef %num, ptr nocapture noundef %ps) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test2(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING:         call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test2(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+
+entry:
+  %cmp19 = icmp sgt i32 %num, 0
+  br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  %0 = load float, ptr %y, align 4
+  %mul = fmul fast float %0, 0x40019999A0000000
+  store float %mul, ptr %y, align 4
+  %z = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 2
+  %1 = load float, ptr %z, align 4
+  %mul6 = fmul fast float %1, 0x40019999A0000000
+  store float %mul6, ptr %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Currently, we don't support the following scenario, as shuffle padding
+; requires that the stored values are loaded from the same object (possibly
+; with some operations applied to them).
+;
+; for (int i = 0; i < num; i++) {
+;   ps[i].x = i;
+;   ps[i].y = i + 1;
+; }
+;
+; Function Attrs: argmemonly nofree norecurse nosync nounwind writeonly uwtable vscale_range(4,4)
+define dso_local void @test3(i32 noundef %num, ptr nocapture noundef writeonly %ps) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test3(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test3(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+entry:
+  %cmp10 = icmp sgt i32 %num, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %0 to float
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  store float %conv, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %1 = trunc i64 %indvars.iv.next to i32
+  %conv1 = sitofp i32 %1 to float
+  %y = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  store float %conv1, ptr %y, align 4
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; In this case the load group has no gaps (all members are used), but the
+; store group has a gap (ps[i].z is not stored).
+;
+; for (int i = 0; i < num; i++) {
+;   ps[i].x += ps[i].y;
+;   ps[i].y += ps[i].z;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(4,4)
+define dso_local void @test4(i32 noundef %num, ptr nocapture noundef %ps, i32 noundef %x, i32 noundef %y, i32 noundef %z) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test4(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING:         call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test4(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+entry:
+  %cmp20 = icmp sgt i32 %num, 0
+  br i1 %cmp20, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %y1 = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  %0 = load float, ptr %y1, align 4
+  %1 = load float, ptr %arrayidx, align 4
+  %add = fadd contract float %0, %1
+  store float %add, ptr %arrayidx, align 4
+  %z7 = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 2
+  %2 = load float, ptr %z7, align 4
+  %add11 = fadd contract float %0, %2
+  store float %add11, ptr %y1, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; In this case the struct members are cross-used by each other; this is
+; also supported.
+;
+; for (int i = 0; i < num; i++) {
+;   ps[i].x += ps[i].y;
+;   ps[i].y += ps[i].x;
+; }
+;
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind uwtable vscale_range(4,4)
+define dso_local void @test5(i32 noundef %num, ptr nocapture noundef %ps, i32 noundef %x, i32 noundef %y, i32 noundef %z) {
+; ENABLE-SHUFFLE-PADDING-LABEL: @test5(
+; ENABLE-SHUFFLE-PADDING:       vector.body:
+; ENABLE-SHUFFLE-PADDING:         call void @llvm.aarch64.sve.st3.nxv4f32(
+;
+; DISABLE-SHUFFLE-PADDING-LABEL: @test5(
+; DISABLE-SHUFFLE-PADDING:       vector.body:
+; DISABLE-SHUFFLE-PADDING-NOT:     call void @llvm.aarch64.sve.st3.nxv4f32(
+entry:
+  %cmp20 = icmp sgt i32 %num, 0
+  br i1 %cmp20, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.patic, ptr %ps, i64 %indvars.iv
+  %y1 = getelementptr inbounds %struct.patic, ptr %arrayidx, i64 0, i32 1
+  %0 = load float, ptr %y1, align 4
+  %1 = load float, ptr %arrayidx, align 4
+  %add = fadd contract float %0, %1
+  store float %add, ptr %arrayidx, align 4
+  %add11 = fadd contract float %0, %add
+  store float %add11, ptr %y1, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+


