[llvm] [LV] Fix gap mask requirement for interleaved access (PR #151105)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 30 00:56:27 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/151105
>From 6993d3c8af529ed48258f6116eae81977022ee16 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 29 Jul 2025 02:19:01 -0700
Subject: [PATCH 1/5] nfc, isFull for group
---
llvm/include/llvm/Analysis/VectorUtils.h | 3 +++
llvm/lib/Analysis/VectorUtils.cpp | 6 +++---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 8 ++------
4 files changed, 10 insertions(+), 11 deletions(-)
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index b55c4e0a6bf76..7a6b16cedc6a3 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -633,6 +633,9 @@ template <typename InstTy> class InterleaveGroup {
return true;
}
+ /// Return true if the group has no gaps.
+ bool isFull() const { return getNumMembers() == getFactor(); }
+
private:
uint32_t Factor; // Interleave Factor.
bool Reverse;
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 150ddced03b15..b3b4c37475eef 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1117,7 +1117,7 @@ Constant *
llvm::createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
const InterleaveGroup<Instruction> &Group) {
// All 1's means mask is not needed.
- if (Group.getNumMembers() == Group.getFactor())
+ if (Group.isFull())
return nullptr;
// TODO: support reversed access.
@@ -1663,7 +1663,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Case 1: A full group. Can Skip the checks; For full groups, if the wide
// load would wrap around the address space we would do a memory access at
// nullptr even without the transformation.
- if (Group->getNumMembers() == Group->getFactor())
+ if (Group->isFull())
continue;
// Case 2: If first and last members of the group don't wrap this implies
@@ -1698,7 +1698,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
// Case 1: A full group. Can Skip the checks; For full groups, if the wide
// store would wrap around the address space we would do a memory access at
// nullptr even without the transformation.
- if (Group->getNumMembers() == Group->getFactor())
+ if (Group->isFull())
continue;
// Interleave-store-group with gaps is implemented using masked wide store.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7b7efb8c6309e..9c06ef990a600 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3131,7 +3131,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
!isScalarEpilogueAllowed();
bool StoreAccessWithGapsRequiresMasking =
- isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
+ isa<StoreInst>(I) && !Group->isFull();
if (!PredicatedAccessRequiresMasking &&
!LoadAccessWithGapsRequiresEpilogMasking &&
!StoreAccessWithGapsRequiresMasking)
@@ -5349,7 +5349,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
(Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
- (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
+ (isa<StoreInst>(I) && !Group->isFull());
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8de05c16041fa..359e40821aa4d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3141,9 +3141,7 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
- return IR->getInterleaveGroup()->getFactor() ==
- IR->getInterleaveGroup()->getNumMembers() &&
- IR->getVPValue(Idx) == OpV;
+ return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
return false;
}
@@ -3260,9 +3258,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
if (!DefR)
return false;
auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
- return IR &&
- IR->getInterleaveGroup()->getFactor() ==
- IR->getInterleaveGroup()->getNumMembers() &&
+ return IR && IR->getInterleaveGroup()->isFull() &&
IR->getVPValue(Op.index()) == Op.value();
})) {
StoreGroups.push_back(InterleaveR);
>From 51b6f48c9ec4164d8fe893fa8b1fbfdb49a8925c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 29 Jul 2025 01:26:28 -0700
Subject: [PATCH 2/5] fix
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 3 +-
.../RISCV/interleaved-store-with-gap.ll | 65 +++++++++++++++++++
2 files changed, 67 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 359e40821aa4d..c1f1818a99e5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2493,7 +2493,8 @@ void VPlanTransforms::createInterleaveGroups(
}
bool NeedsMaskForGaps =
- IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
+ (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
+ (!StoredValues.empty() && (IG->getNumMembers() < IG->getFactor()));
Instruction *IRInsertPos = IG->getInsertPos();
auto *InsertPos =
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll
new file mode 100644
index 0000000000000..8f504cba4242f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -mattr=+v -passes=loop-vectorize -mattr=+v \
+; RUN: -scalable-vectorization=off -enable-masked-interleaved-mem-accesses \
+; RUN: -force-vector-interleave=1 -riscv-v-vector-bits-min=1024 -S < %s | FileCheck %s
+
+define void @store_factor_2_with_tail_gap(i64 %n, ptr %a) {
+; CHECK-LABEL: define void @store_factor_2_with_tail_gap(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i64> [[VEC_IND]], <16 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i64> [[TMP2]], <32 x i64> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: call void @llvm.masked.store.v32i64.p0(<32 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], i32 8, <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT: store i64 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %0
+ store i64 %indvars.iv, ptr %arrayidx, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
>From 68f81ee40d61ccafcea9c5b65ef6f9343c932c45 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 30 Jul 2025 00:16:04 -0700
Subject: [PATCH 3/5] apply isFull
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c1f1818a99e5a..2dc83d535c559 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2494,7 +2494,7 @@ void VPlanTransforms::createInterleaveGroups(
bool NeedsMaskForGaps =
(IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
- (!StoredValues.empty() && (IG->getNumMembers() < IG->getFactor()));
+ (!StoredValues.empty() && !IG->isFull());
Instruction *IRInsertPos = IG->getInsertPos();
auto *InsertPos =
>From 0cccb2df8fba0e8eadf38523227f83c9f8ca6675 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 30 Jul 2025 00:53:54 -0700
Subject: [PATCH 4/5] harden assertion
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 68e7c20a070f4..31e9fa3c32cf1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3548,6 +3548,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved store group.
Value *MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
+ assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
+ "Mismatch between NeedsMaskForGaps and MaskForGaps");
assert((!MaskForGaps || !State.VF.isScalable()) &&
"masking gaps for scalable vectors is not yet supported.");
ArrayRef<VPValue *> StoredValues = getStoredValues();
>From 5bfcb300947980bc2030bfbed9c2a5ceae66724f Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 30 Jul 2025 00:55:51 -0700
Subject: [PATCH 5/5] apply --check-globals=none
---
.../LoopVectorize/RISCV/interleaved-store-with-gap.ll | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll
index 8f504cba4242f..27cd87f1bd7f9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
; RUN: opt -mtriple=riscv64 -mattr=+v -passes=loop-vectorize -mattr=+v \
; RUN: -scalable-vectorization=off -enable-masked-interleaved-mem-accesses \
; RUN: -force-vector-interleave=1 -riscv-v-vector-bits-min=1024 -S < %s | FileCheck %s
@@ -57,9 +57,3 @@ for.body:
exit:
ret void
}
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.
More information about the llvm-commits
mailing list