[llvm] [LoopVectorize] Vectorize the compact pattern (PR #68980)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 13 04:48:34 PDT 2023
https://github.com/huhu233 created https://github.com/llvm/llvm-project/pull/68980
This patch tries to vectorize the compact pattern, shown below:

  for (i = 0; i < N; i++) {
    x = comp[i];
    if (x < a) Out_ref[n++] = B[i];
  }

It introduces the following changes:
1. Add pattern matching in LoopVectorizationLegality to recognize and cache
   compact chains.
2. Introduce two new recipes to handle the compact chain:
   VPCompactPHIRecipe: handles the entry PHI of the compact chain.
   VPWidenCompactInstructionRecipe: handles the other instructions in the compact chain.
3. Slightly adapt the cost model for the compact pattern.
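For readers unfamiliar with SVE, the sketch below shows what a single vector
iteration of the loop above conceptually does once the pattern is vectorized.
It is hand-written with ACLE intrinsics purely for illustration; the function
name and the whilelt-based store predicate are my own choices, and the patch
itself emits equivalent LLVM IR through the new recipes rather than this C code.

  #include <arm_sve.h>

  // Illustrative hand-vectorized form of the compact pattern for i32 data.
  int kernel_sve_sketch(int N, int a, const int *comp, int *Out, const int *B) {
    int n = 0;                                    // next free slot in Out
    for (int i = 0; i < N; i += (int)svcntw()) {
      svbool_t pg   = svwhilelt_b32(i, N);        // loop predicate
      svint32_t x   = svld1_s32(pg, comp + i);
      svbool_t mask = svcmplt_n_s32(pg, x, a);    // lanes where comp[i] < a
      svint32_t b   = svld1_s32(mask, B + i);     // masked load of B
      svint32_t c   = svcompact_s32(mask, b);     // pack active lanes to the front
      int cnt       = (int)svcntp_b32(pg, mask);  // number of active lanes
      svbool_t st   = svwhilelt_b32(0, cnt);      // predicate for the first cnt lanes
      svst1_s32(st, Out + n, c);                  // contiguous masked store
      n += cnt;                                   // carry the packed index forward
    }
    return n;
  }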
From 73a1356bf5a05f738675345be497e8ea291cedb7 Mon Sep 17 00:00:00 2001
From: huhu233 <1293577861 at qq.com>
Date: Fri, 13 Oct 2023 19:36:39 +0800
Subject: [PATCH 1/2] [LoopVectorize] Precommit a test for the compact pattern
---
.../LoopVectorize/AArch64/compact.ll | 79 +++++++++++++++++++
1 file changed, 79 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
new file mode 100644
index 000000000000000..0bafd325bb8e98e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -S < %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16)
+define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 {
+; CHECK-LABEL: @kernel_reference(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[N_013:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_1:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[A:%.*]]
+; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[N_013]], 1
+; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[N_013]] to i64
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[IDXPROM4]]
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[N_1]] = phi i32 [ [[INC]], [[IF_THEN]] ], [ [[N_013]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ]
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[N_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[N_1_LCSSA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[N_0_LCSSA]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %N, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %n.013 = phi i32 [ 0, %for.body.preheader ], [ %n.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %comp, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp1 = icmp slt i32 %0, %a
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx3, align 4
+ %inc = add nsw i32 %n.013, 1
+ %idxprom4 = sext i32 %n.013 to i64
+ %arrayidx5 = getelementptr inbounds i32, ptr %Out_ref, i64 %idxprom4
+ store i32 %1, ptr %arrayidx5, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %n.1 = phi i32 [ %inc, %if.then ], [ %n.013, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %n.0.lcssa = phi i32 [ 0, %entry ], [ %n.1, %for.inc ]
+ ret i32 %n.0.lcssa
+}
+
+attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) "target-cpu"="generic" "target-features"="+neon,+sve,+v8.2a"}
From 74b4fe41e5428071c22690585a190e2aeacbea92 Mon Sep 17 00:00:00 2001
From: huhu233 <1293577861 at qq.com>
Date: Fri, 13 Oct 2023 19:38:06 +0800
Subject: [PATCH 2/2] [LoopVectorize] Vectorize the compact pattern
This patch tries to vectorize the compact pattern, shown below:

  for (i = 0; i < N; i++) {
    x = comp[i];
    if (x < a) Out_ref[n++] = B[i];
  }

It introduces the following changes:
1. Add pattern matching in LoopVectorizationLegality to recognize and cache
   compact chains.
2. Introduce two new recipes to handle the compact chain:
   VPCompactPHIRecipe: handles the entry PHI of the compact chain.
   VPWidenCompactInstructionRecipe: handles the other instructions in the
   compact chain.
3. Slightly adapt the cost model for the compact pattern.
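One implementation detail worth calling out: the new store recipe does not
build the predicate for the packed store with a whilelt-style idiom. Instead,
as genCompactStore below shows, it compacts a splat of 1 under the lane mask
and compares the result against 1, which yields a predicate covering exactly
the first popcount(mask) lanes. A hedged ACLE sketch of the same trick,
assuming i32 elements (the helper name is mine, not from the patch):

  #include <arm_sve.h>

  // Build a predicate covering the first popcount(mask) lanes, mirroring the
  // compact-of-ones + compare sequence emitted by the new store recipe.
  static svbool_t first_active_lanes(svbool_t mask) {
    svint32_t ones  = svdup_s32(1);                 // splat of 1
    svint32_t cones = svcompact_s32(mask, ones);    // 1s packed to the front, 0s after
    return svcmpeq_n_s32(svptrue_b32(), cones, 1);  // true exactly for the packed lanes
  }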
---
.../llvm/Analysis/TargetTransformInfo.h | 25 +++
.../llvm/Analysis/TargetTransformInfoImpl.h | 7 +
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4 +
.../include/llvm/Transforms/Utils/LoopUtils.h | 8 +
.../Vectorize/LoopVectorizationLegality.h | 35 ++++
llvm/lib/Analysis/TargetTransformInfo.cpp | 16 ++
.../SelectionDAG/LegalizeIntegerTypes.cpp | 12 ++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../AArch64/AArch64TargetTransformInfo.cpp | 2 +
.../AArch64/AArch64TargetTransformInfo.h | 10 ++
llvm/lib/Transforms/Utils/LoopUtils.cpp | 29 ++++
.../Vectorize/LoopVectorizationLegality.cpp | 157 ++++++++++++++++++
.../Transforms/Vectorize/LoopVectorize.cpp | 116 ++++++++++++-
.../Transforms/Vectorize/VPRecipeBuilder.h | 5 +
llvm/lib/Transforms/Vectorize/VPlan.cpp | 17 ++
llvm/lib/Transforms/Vectorize/VPlan.h | 62 +++++++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 135 +++++++++++++++
llvm/lib/Transforms/Vectorize/VPlanValue.h | 2 +
.../LoopVectorize/AArch64/compact-vplan.ll | 78 +++++++++
.../LoopVectorize/AArch64/compact.ll | 96 +++++++++--
20 files changed, 801 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5234ef8788d9e96..c2851c10e6ff3ef 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1672,6 +1672,11 @@ class TargetTransformInfo {
/// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const;
+ InstructionCost getCompactCost() const;
+ bool isTargetSupportedCompactStore() const;
+ unsigned getTargetSupportedCompact() const;
+ unsigned getTargetSupportedCNTP() const;
+
/// @}
private:
@@ -2041,6 +2046,10 @@ class TargetTransformInfo::Concept {
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
virtual unsigned getMaxNumArgs() const = 0;
+ virtual bool isTargetSupportedCompactStore() const = 0;
+ virtual unsigned getTargetSupportedCompact() const = 0;
+ virtual unsigned getTargetSupportedCNTP() const = 0;
+ virtual InstructionCost getCompactCost() const = 0;
};
template <typename T>
@@ -2757,6 +2766,22 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getMaxNumArgs() const override {
return Impl.getMaxNumArgs();
}
+
+ bool isTargetSupportedCompactStore() const override {
+ return Impl.isTargetSupportedCompactStore();
+ }
+
+ unsigned getTargetSupportedCompact() const override {
+ return Impl.getTargetSupportedCompact();
+ }
+
+ unsigned getTargetSupportedCNTP() const override {
+ return Impl.getTargetSupportedCNTP();
+ }
+
+ InstructionCost getCompactCost() const override {
+ return Impl.getCompactCost();
+ }
};
template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c1ff314ae51c98b..e063f383980a724 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -895,6 +895,13 @@ class TargetTransformInfoImplBase {
unsigned getMaxNumArgs() const { return UINT_MAX; }
+ bool isTargetSupportedCompactStore() const { return false; }
+ unsigned getTargetSupportedCompact() const { return 0; }
+ unsigned getTargetSupportedCNTP() const { return 0; }
+ InstructionCost getCompactCost() const {
+ return InstructionCost::getInvalid();
+ }
+
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 3dd16dafe3c42a7..737757761eca4ab 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -700,6 +700,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return getST()->getMaxPrefetchIterationsAhead();
}
+ virtual InstructionCost getCompactCost() const {
+ return InstructionCost::getInvalid();
+ }
+
virtual bool enableWritePrefetching() const {
return getST()->enableWritePrefetching();
}
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 0d99249be413762..348b8ad03de4179 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -409,6 +409,14 @@ Value *createAnyOfTargetReduction(IRBuilderBase &B, Value *Src,
Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc,
Value *Src, PHINode *OrigPhi = nullptr);
+Value *createTargetCompact(IRBuilderBase &B, Module *M,
+ const TargetTransformInfo *TTI, Value *Mask,
+ Value *Val);
+
+Value *createTargetCNTP(IRBuilderBase &B, Module *M,
+ const TargetTransformInfo *TTI, Value *Mask,
+ Value *Val);
+
/// Create an ordered reduction intrinsic using the given recurrence
/// descriptor \p Desc.
Value *createOrderedReduction(IRBuilderBase &B,
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 20cfc680e8f90b3..7f82154699e5174 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -224,6 +224,26 @@ class LoopVectorizationRequirements {
Instruction *ExactFPMathInst = nullptr;
};
+class CompactDescriptor {
+ PHINode *LiveOutPhi;
+ bool IsCompactSign;
+ SmallPtrSet<Value *, 8> Chain;
+
+public:
+ CompactDescriptor() = default;
+ CompactDescriptor(SmallPtrSetImpl<Value *> &CompactChain, PHINode *LiveOut,
+ bool IsSign)
+ : LiveOutPhi(LiveOut), IsCompactSign(IsSign) {
+ Chain.insert(CompactChain.begin(), CompactChain.end());
+ }
+
+ bool isInCompactChain(Value *V) const { return Chain.find(V) != Chain.end(); }
+
+ PHINode *getLiveOutPhi() const { return LiveOutPhi; }
+
+ bool isSign() const { return IsCompactSign; }
+};
+
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
@@ -261,6 +281,8 @@ class LoopVectorizationLegality {
/// inductions and reductions.
using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
+ using CompactList = MapVector<PHINode *, CompactDescriptor>;
+
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
@@ -397,6 +419,14 @@ class LoopVectorizationLegality {
DominatorTree *getDominatorTree() const { return DT; }
+ const CompactList &getCompactList() const { return CpList; }
+
+ bool hasCompactChain() const { return CpList.size() > 0; }
+
+ PHINode *getCompactChainStart(Instruction *I) const;
+
+ bool isSign(PHINode *Phi) { return CpList[Phi].isSign(); };
+
private:
/// Return true if the pre-header, exiting and latch blocks of \p Lp and all
/// its nested loops are considered legal for vectorization. These legal
@@ -425,6 +455,8 @@ class LoopVectorizationLegality {
/// and we only need to check individual instructions.
bool canVectorizeInstrs();
+ bool isMatchCompact(PHINode *Phi, Loop *TheLoop, CompactDescriptor &CpDesc);
+
/// When we vectorize loops we may change the order in which
/// we read and write from memory. This method checks if it is
/// legal to vectorize the code, considering only memory constrains.
@@ -538,6 +570,9 @@ class LoopVectorizationLegality {
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
+
+ // Record compact chain in the loop.
+ CompactList CpList;
};
} // namespace llvm
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index aad14f21d114619..b7596bb2e0dfc92 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1248,6 +1248,22 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
}
+bool TargetTransformInfo::isTargetSupportedCompactStore() const {
+ return TTIImpl->isTargetSupportedCompactStore();
+}
+
+unsigned TargetTransformInfo::getTargetSupportedCompact() const {
+ return TTIImpl->getTargetSupportedCompact();
+}
+
+unsigned TargetTransformInfo::getTargetSupportedCNTP() const {
+ return TTIImpl->getTargetSupportedCNTP();
+}
+
+InstructionCost TargetTransformInfo::getCompactCost() const {
+ return TTIImpl->getCompactCost();
+}
+
TargetTransformInfo::Concept::~Concept() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index fc9e3ff3734989d..d30d0b57b5d47b0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
@@ -301,6 +302,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::FFREXP:
Res = PromoteIntRes_FFREXP(N);
break;
+ case ISD::INTRINSIC_WO_CHAIN:
+ if (N->getConstantOperandVal(0) == Intrinsic::aarch64_sve_compact) {
+ Res = PromoteIntRes_COMPACT(N);
+ break;
+ }
}
// If the result is null then the sub-method took care of registering it.
@@ -5942,6 +5948,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
return DAG.getBuildVector(N->getValueType(0), dl, NewOps);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_COMPACT(SDNode *N) {
+ SDValue OpExt = SExtOrZExtPromotedInteger(N->getOperand(2));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), OpExt.getValueType(),
+ N->getOperand(0), N->getOperand(1), OpExt);
+}
+
SDValue DAGTypeLegalizer::ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
assert(OpNo > 1);
SDValue Op = N->getOperand(OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index c802604a3470e13..d204169ed2327f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -364,6 +364,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_FunnelShift(SDNode *N);
SDValue PromoteIntRes_VPFunnelShift(SDNode *N);
SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
+ SDValue PromoteIntRes_COMPACT(SDNode *N);
// Integer Operand Promotion.
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d8a0e68d7123759..5ca5f22525d3dd3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3889,3 +3889,5 @@ AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
return AM.Scale != 0 && AM.Scale != 1;
return -1;
}
+
+InstructionCost AArch64TTIImpl::getCompactCost() const { return 6; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..28bd48e8ed76c4a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include <cstdint>
#include <optional>
@@ -412,6 +413,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
}
+
+ bool isTargetSupportedCompactStore() const { return ST->hasSVE(); }
+ unsigned getTargetSupportedCompact() const {
+ return Intrinsic::aarch64_sve_compact;
+ }
+ unsigned getTargetSupportedCNTP() const {
+ return Intrinsic::aarch64_sve_cntp;
+ }
+ InstructionCost getCompactCost() const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 21affe7bdce406e..1373fb7931f0a7e 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -34,6 +34,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
@@ -1119,6 +1120,34 @@ Value *llvm::createTargetReduction(IRBuilderBase &B,
return createSimpleTargetReduction(B, Src, RK);
}
+Value *llvm::createTargetCompact(IRBuilderBase &B, Module *M,
+ const TargetTransformInfo *TTI, Value *Mask,
+ Value *Val) {
+ Intrinsic::ID IID = TTI->getTargetSupportedCompact();
+ switch (IID) {
+ default:
+ return nullptr;
+ case Intrinsic::aarch64_sve_compact:
+ Function *CompactMaskDecl = Intrinsic::getDeclaration(
+ M, Intrinsic::aarch64_sve_compact, Val->getType());
+ return B.CreateCall(CompactMaskDecl, {Mask, Val});
+ }
+}
+
+Value *llvm::createTargetCNTP(IRBuilderBase &B, Module *M,
+ const TargetTransformInfo *TTI, Value *Mask,
+ Value *Val) {
+ Intrinsic::ID IID = TTI->getTargetSupportedCNTP();
+ switch (IID) {
+ default:
+ return nullptr;
+ case Intrinsic::aarch64_sve_cntp:
+ Function *CNTPDecl = Intrinsic::getDeclaration(
+ M, Intrinsic::aarch64_sve_cntp, Val->getType());
+ return B.CreateCall(CNTPDecl, {Mask, Val});
+ }
+}
+
Value *llvm::createOrderedReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc,
Value *Src, Value *Start) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 35d69df56dc7220..dbab8af159a1621 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
@@ -78,6 +79,11 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
"Scalable vectorization is available and favored when the "
"cost is inconclusive.")));
+static cl::opt<bool>
+ EnableCompactVectorization("enable-compact-vectorization", cl::init(true),
+ cl::Hidden,
+ cl::desc("Enable vectorizing compact pattern."));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -785,6 +791,143 @@ static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
return Scalarize;
}
+static bool isUserOfCompactPHI(BasicBlock *BB, PHINode *Phi, Instruction *I) {
+ if (I->getParent() != BB)
+ return false;
+
+ // Operations on PHI should be affine.
+ if (I->getOpcode() != Instruction::Add &&
+ I->getOpcode() != Instruction::Sub &&
+ I->getOpcode() != Instruction::SExt &&
+ I->getOpcode() != Instruction::ZExt)
+ return false;
+
+ if (I == Phi)
+ return true;
+
+ for (unsigned i = 0; i < I->getNumOperands(); i++) {
+ if (auto *Instr = dyn_cast<Instruction>(I->getOperand(i)))
+ if (isUserOfCompactPHI(BB, Phi, Instr))
+ return true;
+ }
+ return false;
+}
+
+// Match the basic compact pattern:
+// for.body:
+// %src.phi = phi i64 [ 0, %preheader ], [ %target.phi, %for.inc ]
+// ...
+// if.then:
+// ...
+// %data = load i32, ptr %In
+// (there may be an additional sext/zext if %src.phi has type i32)
+// %addr = getelementptr i32, ptr %Out, i64 %src.phi
+// store i32 %data, ptr %addr
+// %inc = add i64 %src.phi, 1
+// for.inc
+// %target.phi = phi i64 [ %inc, if.then ], [ %src.phi, %for.body ]
+bool LoopVectorizationLegality::isMatchCompact(PHINode *Phi, Loop *TheLoop,
+ CompactDescriptor &CpDesc) {
+ if (Phi->getNumIncomingValues() > 2)
+ return false;
+
+ // Don't support phis that are used as a mask.
+ for (User *U : Phi->users()) {
+ if (isa<CmpInst>(U))
+ return false;
+ }
+
+ SmallPtrSet<Value *, 8> CompactChain;
+ CompactChain.insert(Phi);
+
+ BasicBlock *LoopPreHeader = TheLoop->getLoopPreheader();
+ int ExitIndex = Phi->getIncomingBlock(0) == LoopPreHeader ? 1 : 0;
+ BasicBlock *ExitBlock = Phi->getIncomingBlock(ExitIndex);
+ PHINode *CompactLiveOut = nullptr;
+ Value *IncValue = nullptr;
+ BasicBlock *IncBlock = nullptr;
+ bool IsCycle = false;
+ for (auto &CandPhi : ExitBlock->phis()) {
+ if (llvm::is_contained(CandPhi.incoming_values(), Phi) &&
+ CandPhi.getNumIncomingValues() == 2) {
+ IsCycle = true;
+ CompactLiveOut = &CandPhi;
+ int IncIndex = CandPhi.getIncomingBlock(0) == Phi->getParent() ? 1 : 0;
+ IncBlock = CandPhi.getIncomingBlock(IncIndex);
+ IncValue = CandPhi.getIncomingValueForBlock(IncBlock);
+ break;
+ }
+ }
+ // Similar to a reduction PHI.
+ if (!IsCycle)
+ return false;
+ CompactChain.insert(CompactLiveOut);
+
+ // Match the pattern %inc = add i32 %src.phi, 1.
+ Value *Index = nullptr, *Step = nullptr;
+ if (!match(IncValue, m_Add(m_Value(Index), m_Value(Step))))
+ return false;
+ if (Index != Phi) {
+ std::swap(Index, Step);
+ }
+ if (Step != ConstantInt::get(Step->getType(), 1))
+ return false;
+ CompactChain.insert(IncValue);
+
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+ int CntCandStores = 0;
+ GetElementPtrInst *GEP = nullptr;
+ for (auto &Inst : *IncBlock) {
+ if (auto *SI = dyn_cast<StoreInst>(&Inst)) {
+ // TODO: Support llvm.aarch64.sve.compact.nxv8i16 and
+ // llvm.aarch64.sve.compact.nxv16i8 in the future.
+ unsigned TySize = DL.getTypeSizeInBits(SI->getValueOperand()->getType());
+ if (TySize < 32)
+ return false;
+
+ GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand());
+ if (GEP == nullptr)
+ continue;
+
+ // Only handle single pointer.
+ if (GEP->getNumOperands() != 2)
+ continue;
+
+ // Get the index of the GEP; it could be the phi or a sext/zext of it
+ // (if the phi has type i32).
+ Value *Op1 = GEP->getOperand(1);
+ Value *X = nullptr;
+ SmallSet<Value *, 16> CandiInstrs;
+ if (match(Op1, m_SExt(m_Value(X))) || match(Op1, m_ZExt(m_Value(X)))) {
+ Op1 = X;
+ }
+ Instruction *Op1Instr = dyn_cast<Instruction>(Op1);
+ if (!Op1Instr || isUserOfCompactPHI(IncBlock, Phi, Op1Instr))
+ continue;
+ CompactChain.insert(GEP);
+ CompactChain.insert(SI);
+ CntCandStores++;
+ }
+ }
+ if (!CntCandStores)
+ return false;
+
+ KnownBits Bits = computeKnownBits(Phi, DL);
+ bool IsSign = !Bits.isNonNegative();
+ CompactDescriptor CompactDesc(CompactChain, CompactLiveOut, IsSign);
+ CpDesc = CompactDesc;
+ LLVM_DEBUG(dbgs() << "LV: Found a compact chain.\n");
+ return true;
+}
+
+PHINode *LoopVectorizationLegality::getCompactChainStart(Instruction *I) const {
+ for (auto &CpDesc : CpList) {
+ if (CpDesc.second.isInCompactChain(I))
+ return CpDesc.first;
+ }
+ return nullptr;
+}
+
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *Header = TheLoop->getHeader();
@@ -881,6 +1024,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
+ CompactDescriptor CpDesc;
+ if (EnableCompactVectorization &&
+ TTI->isTargetSupportedCompactStore() &&
+ isMatchCompact(Phi, TheLoop, CpDesc)) {
+ CpList[Phi] = CpDesc;
+ continue;
+ }
+
reportVectorizationFailure("Found an unidentified PHI",
"value that could not be identified as "
"reduction is used outside the loop",
@@ -1525,16 +1676,22 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
SmallPtrSet<const Value *, 8> ReductionLiveOuts;
+ SmallPtrSet<const Value *, 8> CompactLiveOuts;
for (const auto &Reduction : getReductionVars())
ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
+ for (const auto &Compact : getCompactList())
+ CompactLiveOuts.insert(Compact.second.getLiveOutPhi());
+
// TODO: handle non-reduction outside users when tail is folded by masking.
for (auto *AE : AllowedExit) {
// Check that all users of allowed exit values are inside the loop or
// are the live-out of a reduction.
if (ReductionLiveOuts.count(AE))
continue;
+ if (CompactLiveOuts.count(AE))
+ continue;
for (User *U : AE->users()) {
Instruction *UI = cast<Instruction>(U);
if (TheLoop->contains(UI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 88f064b6d57cebc..28d3cadc0cf392d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -619,6 +619,8 @@ class InnerLoopVectorizer {
/// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
+ void fixCompactPHI(VPCompactPHIRecipe *CompactPHIR, VPTransformState &State);
+
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -1968,7 +1970,8 @@ class GeneratedRTChecks {
/// there is no vector code generation, the check blocks are removed
/// completely.
void Create(Loop *L, const LoopAccessInfo &LAI,
- const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
+ const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
+ LoopVectorizationLegality *LVL = nullptr) {
// Hard cutoff to limit compile-time increase in case a very large number of
// runtime checks needs to be generated.
@@ -2001,7 +2004,7 @@ class GeneratedRTChecks {
"vector.memcheck");
auto DiffChecks = RtPtrChecking.getDiffChecks();
- if (DiffChecks) {
+ if (DiffChecks && !(LVL && LVL->hasCompactChain())) {
Value *RuntimeVF = nullptr;
MemRuntimeCheckCond = addDiffRuntimeChecks(
MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
@@ -3654,6 +3657,47 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
for (VPRecipeBase &R : Header->phis()) {
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
fixFixedOrderRecurrence(FOR, State);
+ else if (auto *CompactR = dyn_cast<VPCompactPHIRecipe>(&R))
+ fixCompactPHI(CompactR, State);
+ }
+}
+
+void InnerLoopVectorizer::fixCompactPHI(VPCompactPHIRecipe *CompactPHIR,
+ VPTransformState &State) {
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstNonPHI());
+ VPValue *VPBackEdgeValue = CompactPHIR->getBackedgeValue();
+ Value *BackEdgeValue = State.get(VPBackEdgeValue, State.UF - 1);
+ Value *StartValue = CompactPHIR->getStartValue()->getUnderlyingValue();
+ Value *TruncBackEdgeValue = BackEdgeValue;
+ if (StartValue->getType() != BackEdgeValue->getType())
+ TruncBackEdgeValue =
+ Builder.CreateTruncOrBitCast(BackEdgeValue, StartValue->getType());
+
+ // Generate phi in scalar preheader to pass LiveIns outside the loop.
+ PHINode *ScalarPreheaderPN =
+ PHINode::Create(StartValue->getType(), 2, "compact.rdx",
+ LoopScalarPreHeader->getFirstNonPHI());
+
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+ if (Incoming == LoopMiddleBlock)
+ ScalarPreheaderPN->addIncoming(TruncBackEdgeValue, Incoming);
+ else
+ ScalarPreheaderPN->addIncoming(StartValue, Incoming);
+ }
+
+ Value *ScalarBackEdgeValue =
+ CompactPHIR->getBackedgeValue()->getUnderlyingValue();
+ for (PHINode &Phi : LoopScalarBody->phis()) {
+ if (llvm::is_contained(Phi.incoming_values(), ScalarBackEdgeValue)) {
+ Phi.setIncomingValueForBlock(LoopScalarPreHeader, ScalarPreheaderPN);
+ }
+ }
+
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), ScalarBackEdgeValue)) {
+ LCSSAPhi.addIncoming(TruncBackEdgeValue, LoopMiddleBlock);
+ State.Plan->removeLiveOut(&LCSSAPhi);
+ }
}
}
@@ -4260,6 +4304,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
.Kind == CM_Scalarize;
case Instruction::Load:
case Instruction::Store: {
+ if (Legal->getCompactChainStart(I) != nullptr)
+ return false;
auto *Ptr = getLoadStorePointerOperand(I);
auto *Ty = getLoadStoreType(I);
Type *VTy = Ty;
@@ -4621,6 +4667,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
continue;
}
+ // GEPs in compact chain should be uniform after vectorization.
+ if (isa<GetElementPtrInst>(&I) && Legal->getCompactChainStart(&I)) {
+ addToWorklistIfAllowed(&I);
+ continue;
+ }
+
// If there's no pointer operand, there's nothing to do.
auto *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
@@ -6844,6 +6896,24 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
continue;
}
+ if (isa<StoreInst>(I) && Legal->hasCompactChain()) {
+ InstructionCost Cost = 0;
+ if (!VF.isScalable() || VF.isScalar()) {
+ setWideningDecision(&I, VF, CM_Widen, InstructionCost::getInvalid());
+ continue;
+ }
+ Type *EleTy = getLoadStoreType(&I);
+ VectorType *VectorTy = cast<VectorType>(ToVectorTy(EleTy, VF));
+ const Align Alignment = getLoadStoreAlignment(&I);
+ unsigned AS = getLoadStoreAddressSpace(&I);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ Cost += TTI.getMaskedMemoryOpCost(I.getOpcode(), VectorTy, Alignment,
+ AS, CostKind);
+ Cost += TTI.getCompactCost();
+ setWideningDecision(&I, VF, CM_Widen, Cost);
+ continue;
+ }
+
// Choose between Interleaving, Gather/Scatter or Scalarization.
InstructionCost InterleaveCost = InstructionCost::getInvalid();
unsigned NumAccesses = 1;
@@ -8590,6 +8660,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
return toVPRecipeResult(Recipe);
}
+VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenCompactRecipe(
+ Instruction *Instr, ArrayRef<VPValue *> Operands, VPlanPtr &Plan,
+ bool IsSign, const TargetTransformInfo *TTI) {
+ if (auto Phi = dyn_cast<PHINode>(Instr)) {
+ if (Instr->getParent() != OrigLoop->getHeader())
+ return toVPRecipeResult(new VPWidenCompactInstructionRecipe(
+ Instr, Instr->getOpcode(), Operands));
+
+ VPValue *StartV = Operands[0];
+ VPHeaderPHIRecipe *PhiRecipe = new VPCompactPHIRecipe(Phi, StartV, IsSign);
+ recordRecipeOf(cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
+ PhisToFix.push_back(PhiRecipe);
+ return toVPRecipeResult(PhiRecipe);
+ }
+
+ if (isa<GetElementPtrInst>(Instr))
+ return nullptr;
+
+ VPValue *Mask = createBlockInMask(Instr->getParent(), *Plan);
+ return toVPRecipeResult(new VPWidenCompactInstructionRecipe(
+ Instr, Instr->getOpcode(), Operands, Mask, TTI));
+}
+
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
@@ -8681,6 +8775,10 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
+ // Don't build fixed-width VPlans if there is a compact chain in the
+ // loop.
+ if (Legal->hasCompactChain() && !MinVF.isScalable())
+ return;
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
@@ -8853,8 +8951,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
continue;
- auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
- Instr, Operands, Range, VPBB, Plan);
+ VPRecipeOrVPValueTy RecipeOrValue;
+ if (PHINode *ChainStart = Legal->getCompactChainStart(Instr)) {
+ RecipeOrValue = RecipeBuilder.tryToCreateWidenCompactRecipe(
+ Instr, Operands, Plan, Legal->isSign(ChainStart), &TTI);
+ } else {
+ RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands,
+ Range, VPBB, Plan);
+ }
+
if (!RecipeOrValue)
RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
// If Instr can be simplified to an existing VPValue, use it.
@@ -10028,7 +10133,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
if (VF.Width.isVector() || SelectedIC > 1)
- Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+ Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
+ &LVL);
// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a09089e9..0449b678b714072 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -123,6 +123,11 @@ class VPRecipeBuilder {
VFRange &Range, VPBasicBlock *VPBB,
VPlanPtr &Plan);
+ VPRecipeOrVPValueTy
+ tryToCreateWidenCompactRecipe(Instruction *Instr,
+ ArrayRef<VPValue *> Operands, VPlanPtr &Plan,
+ bool IsSign, const TargetTransformInfo *TTI);
+
/// Set the recipe created for given ingredient. This operation is a no-op for
/// ingredients that were not marked using a nullptr entry in the map.
void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b4d464ad7ad8c7c..39c0f114a79cfef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -818,6 +818,7 @@ void VPlan::execute(VPTransformState *State) {
// generated.
bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
+ isa<VPCompactPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
@@ -828,6 +829,22 @@ void VPlan::execute(VPTransformState *State) {
SinglePartNeeded ? State->UF - 1 : Part);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
+
+ // Fix Compact phis if UF > 1.
+ if (isa<VPCompactPHIRecipe>(PhiR)) {
+ for (unsigned Part = 1; Part < State->UF; ++Part) {
+ Value *Val = State->get(PhiR->getBackedgeValue(), Part - 1);
+ // BOSCC vectorization will transform liveouts into phis, and we should
+ // get the underlying value here.
+ if (auto *PN = dyn_cast<PHINode>(Val)) {
+ int ValIdx = isa<PoisonValue>(PN->getOperand(0)) ? 1 : 0;
+ Val = PN->getOperand(ValIdx);
+ }
+ PHINode *Phi = cast<PHINode>(State->get(PhiR, Part));
+ Phi->replaceAllUsesWith(Val);
+ Phi->eraseFromParent();
+ }
+ }
}
// We do not attempt to preserve DT for outer loop vectorization currently.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e65a7ab2cd028ee..e04266b7f5f1856 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1274,6 +1274,40 @@ struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
}
};
+class VPWidenCompactInstructionRecipe : public VPRecipeBase, public VPValue {
+private:
+ Instruction &Ingredient;
+ unsigned Opcode;
+ VPValue *Mask;
+ const TargetTransformInfo *TTI;
+
+ void genCompactInc(VPTransformState &State);
+ void genCompactStore(VPTransformState &State);
+ void genCompactLiveOut(VPTransformState &State);
+
+public:
+ VPWidenCompactInstructionRecipe(Instruction *I, unsigned Opcode,
+ ArrayRef<VPValue *> Operands,
+ VPValue *Mask = nullptr,
+ const TargetTransformInfo *TTI = nullptr)
+ : VPRecipeBase(VPDef::VPCompactInstructionSC, Operands), VPValue(this, I),
+ Ingredient(*I), Opcode(Opcode), Mask(Mask), TTI(TTI) {}
+ ~VPWidenCompactInstructionRecipe() override = default;
+
+ VP_CLASSOF_IMPL(VPDef::VPCompactInstructionSC)
+
+ unsigned getOpcode() const { return Opcode; }
+
+ VPValue *getMask() { return Mask; }
+
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue {
bool isPointerLoopInvariant() const {
@@ -1616,6 +1650,34 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
bool isInLoop() const { return IsInLoop; }
};
+class VPCompactPHIRecipe : public VPHeaderPHIRecipe {
+ PHINode *CompactPHI;
+ bool IsCompactSign;
+
+public:
+ VPCompactPHIRecipe(PHINode *Phi, VPValue *Start, bool IsSign)
+ : VPHeaderPHIRecipe(VPDef::VPCompactPHISC, Phi, Start), CompactPHI(Phi),
+ IsCompactSign(IsSign) {}
+
+ ~VPCompactPHIRecipe() override = default;
+
+ VP_CLASSOF_IMPL(VPDef::VPCompactPHISC)
+
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPDef::VPCompactPHISC;
+ }
+
+ bool isSign() { return IsCompactSign; }
+
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPRecipeBase, public VPValue {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a1213a98095907..06237f779415c33 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -27,6 +27,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
@@ -945,6 +946,140 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
VecInd->addIncoming(LastInduction, VectorPH);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenCompactInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "COMPACT ";
+ if (getOpcode() != Instruction::Store) {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+ O << Instruction::getOpcodeName(getOpcode()) << " ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+void VPWidenCompactInstructionRecipe::execute(VPTransformState &State) {
+ switch (getOpcode()) {
+ case Instruction::Add:
+ genCompactInc(State);
+ break;
+ case Instruction::PHI:
+ genCompactLiveOut(State);
+ break;
+ case Instruction::Store:
+ genCompactStore(State);
+ break;
+ default:
+ llvm_unreachable("Unsupport opcode for compact.");
+ }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactStore(VPTransformState &State) {
+ assert(State.VF.isScalable() && "Compact store is for SVE scenario");
+ auto &Builder = State.Builder;
+ VPValue *VPStoredValue = getOperand(0);
+ VPValue *VPAddr = getOperand(1);
+ StoreInst *SI = cast<StoreInst>(&Ingredient);
+ Type *ScalarTy = getLoadStoreType(&Ingredient);
+ Module *M = SI->getModule();
+ VectorType *MaskVTy = cast<VectorType>(State.get(getMask(), 0)->getType());
+ Constant *One = nullptr;
+ unsigned VL = MaskVTy->getElementCount().getKnownMinValue();
+ switch (VL) {
+ case 2:
+ One = ConstantInt::get(Type::getInt64Ty(M->getContext()), 1);
+ break;
+ case 4:
+ One = ConstantInt::get(Type::getInt32Ty(M->getContext()), 1);
+ break;
+ default:
+ // TODO: Try to support compact.nxv8i16 / compact.nxv16i8 in the future.
+ llvm_unreachable("Unsupported type");
+ }
+ Constant *VOne = ConstantVector::getSplat(MaskVTy->getElementCount(), One);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // Generate compact mask.
+ Value *Mask = State.get(getMask(), Part);
+ Value *CompactMaskII = createTargetCompact(Builder, M, TTI, Mask, VOne);
+ assert(CompactMaskII && "Do not support compact in current target.");
+ Value *CompactCmpII =
+ Builder.CreateCmp(ICmpInst::ICMP_EQ, CompactMaskII, VOne);
+
+ // Transform stored value into compact form.
+ VectorType *StoreVTy = VectorType::get(ScalarTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ Value *Addr = State.get(VPAddr, VPIteration(Part, 0));
+ Value *StoredValue = State.get(VPStoredValue, Part);
+ Value *SCompact = createTargetCompact(Builder, M, TTI, Mask, StoredValue);
+ assert(SCompact && "Do not support compact in current target.");
+ Instruction *CompactSI =
+ Builder.CreateMaskedStore(SCompact, Addr, Alignment, CompactCmpII);
+ State.addMetadata(CompactSI, SI);
+ }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactInc(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ Module *M = getUnderlyingInstr()->getModule();
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Mask = State.get(getMask(), Part);
+ Constant *PTrue = ConstantInt::getTrue(cast<VectorType>(Mask->getType()));
+ Value *CNTPCall = createTargetCNTP(Builder, M, TTI, PTrue, Mask);
+ Value *Idx = nullptr;
+ if (Part == 0)
+ Idx = State.get(getOperand(0), Part);
+ else
+ Idx = State.get(this, Part - 1);
+ Value *TruncCall = CNTPCall;
+ if (Idx->getType() != CNTPCall->getType()) {
+ TruncCall = Builder.CreateTrunc(CNTPCall, Idx->getType());
+ }
+ Value *NewInc = Builder.CreateAdd(cast<Instruction>(Idx), TruncCall);
+ State.set(this, NewInc, Part);
+ }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactLiveOut(
+ VPTransformState &State) {
+ // Get the exit value of phi
+ VPValue *VPExitValue = nullptr;
+ PHINode *Phi = cast<PHINode>(&Ingredient);
+ for (unsigned Idx = 0; Idx < Phi->getNumIncomingValues(); Idx++) {
+ PHINode *PhiOp =
+ dyn_cast_or_null<PHINode>(getOperand(Idx)->getUnderlyingValue());
+ if (!PhiOp) {
+ VPExitValue = getOperand(Idx);
+ break;
+ }
+ }
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *ExitVal = State.get(VPExitValue, Part);
+ State.set(this, ExitVal, Part);
+ }
+}
+
+void VPCompactPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "COMPACT-PHI ";
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+
+void VPCompactPHIRecipe::execute(VPTransformState &State) {
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ BasicBlock *VectorHeader = State.CFG.PrevBB;
+ VPValue *StartVPV = getStartValue();
+ Value *Start = StartVPV->getLiveInIRValue();
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ PHINode *Entry = PHINode::Create(Start->getType(), 2, "compact.iv",
+ &*VectorHeader->getFirstInsertionPt());
+ Entry->addIncoming(Start, VectorPH);
+ State.set(this, Entry, Part);
+ }
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index ac110bb3b0ef9be..3d5cc8f9ea4620e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -348,6 +348,7 @@ class VPDef {
VPWidenCastSC,
VPWidenGEPSC,
VPWidenMemoryInstructionSC,
+ VPCompactInstructionSC,
VPWidenSC,
VPWidenSelectSC,
// START: Phi-like recipes. Need to be kept together.
@@ -361,6 +362,7 @@ class VPDef {
VPWidenPHISC,
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
+ VPCompactPHISC,
VPReductionPHISC,
// END: SubclassID for recipes that inherit VPHeaderPHIRecipe
// END: Phi-like recipes
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll
new file mode 100644
index 000000000000000..d8d3ebdaacd42ba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll
@@ -0,0 +1,78 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK-LABEL: 'kernel_reference'
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = vector-trip-count
+; CHECK-NEXT: vp<%1> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ph:
+; CHECK-NEXT: EMIT vp<%1> = EXPAND SCEV (zext i32 %N to i64)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
+; CHECK-NEXT: COMPACT-PHI ir<%n.013> = phi ir<0>, ir<%n.1>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%comp>, vp<%4>
+; CHECK-NEXT: WIDEN ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%0>, ir<%a>
+; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%B>, vp<%4>
+; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx3>, ir<%cmp1>
+; CHECK-NEXT: COMPACT ir<%inc> = add ir<%n.013>, ir<1>
+; CHECK-NEXT: CLONE ir<%idxprom4> = sext ir<%n.013>
+; CHECK-NEXT: CLONE ir<%arrayidx5> = getelementptr inbounds ir<%Out_ref>, ir<%idxprom4>
+; CHECK-NEXT: COMPACT store ir<%1>, ir<%arrayidx5>
+; CHECK-NEXT: COMPACT ir<%n.1> = phi ir<%inc>, ir<%n.013>
+; CHECK-NEXT: EMIT vp<%15> = VF * UF + nuw vp<%2>
+; CHECK-NEXT: EMIT branch-on-count vp<%15>, vp<%0>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+;
+; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16)
+define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 {
+entry:
+ %cmp11 = icmp sgt i32 %N, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %n.013 = phi i32 [ 0, %for.body.preheader ], [ %n.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %comp, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp1 = icmp slt i32 %0, %a
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx3, align 4
+ %inc = add nsw i32 %n.013, 1
+ %idxprom4 = sext i32 %n.013 to i64
+ %arrayidx5 = getelementptr inbounds i32, ptr %Out_ref, i64 %idxprom4
+ store i32 %1, ptr %arrayidx5, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %n.1 = phi i32 [ %inc, %if.then ], [ %n.013, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %n.0.lcssa = phi i32 [ 0, %entry ], [ %n.1, %for.inc ]
+ ret i32 %n.0.lcssa
+}
+
+attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) "target-cpu"="generic" "target-features"="+neon,+sve,+v8.2a"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
index 0bafd325bb8e98e..419ba76eb4cd4ac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
@@ -4,6 +4,12 @@
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
+; for (i = 0; i < N; i++) {
+; x = comp[i];
+; if (x < a) Out[n++] = B[i];
+; }
+; return n;
+
; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16)
define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 {
; CHECK-LABEL: @kernel_reference(
@@ -12,29 +18,97 @@ define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noali
; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[COMPACT_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COMP]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP22]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
+; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[COMPACT_IV]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP17]])
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT: [[TMP29]] = add i32 [[TMP26]], [[TMP28]]
+; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[COMPACT_IV]] to i64
+; CHECK-NEXT: [[TMP31:%.*]] = sext i32 [[TMP26]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[TMP30]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF]], i64 [[TMP31]]
+; CHECK-NEXT: [[TMP34:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq <vscale x 4 x i32> [[TMP34]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP36:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP36]], ptr [[TMP32]], i32 4, <vscale x 4 x i1> [[TMP35]])
+; CHECK-NEXT: [[TMP37:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <vscale x 4 x i32> [[TMP37]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP39:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD3]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP39]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP38]])
+; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP41]]
+; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[COMPACT_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT: [[N_013:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_1:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[A:%.*]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[N_013:%.*]] = phi i32 [ [[COMPACT_RDX]], [[SCALAR_PH]] ], [ [[N_1:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP43]], [[A]]
; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[N_013]], 1
; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[N_013]] to i64
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[IDXPROM4]]
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF]], i64 [[IDXPROM4]]
+; CHECK-NEXT: store i32 [[TMP44]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[N_1]] = phi i32 [ [[INC]], [[IF_THEN]] ], [ [[N_013]], [[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end.loopexit:
-; CHECK-NEXT: [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[N_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[N_1_LCSSA]], [[FOR_END_LOOPEXIT]] ]