[llvm] ade47bd - [LV] Improve register pressure estimate at high VFs
Peter Waller via llvm-commits
llvm-commits at lists.llvm.org
Mon May 23 01:01:47 PDT 2022
Author: Peter Waller
Date: 2022-05-23T07:57:45Z
New Revision: ade47bdc317bda57fe0f2a741e5bf271cfc44054
URL: https://github.com/llvm/llvm-project/commit/ade47bdc317bda57fe0f2a741e5bf271cfc44054
DIFF: https://github.com/llvm/llvm-project/commit/ade47bdc317bda57fe0f2a741e5bf271cfc44054.diff
LOG: [LV] Improve register pressure estimate at high VFs
Previously, `getRegUsageForType` was implemented using
`getTypeLegalizationCost`. `getRegUsageForType` is used by the loop
vectorizer to estimate the register pressure caused by using a vector
type. However, `getTypeLegalizationCost` currently only appears to
understand splitting and not scalarization, so it significantly
underestimates the register requirements.
Instead, use `getNumRegisters`, which understands when scalarization
can occur (via computeRegisterProperties).
This was discovered while investigating D118979 (Set maximum VF with
shouldMaximizeVectorBandwidth), where under fixed-length 512-bit SVE the
loop vectorizer previously ended up costing a v128i1 as 2 v64i*
registers where it actually occupies 128 i32 registers.
I'm sending this patch early for comment; I'm still doing some sanity checking
with LNT. I note that getRegisterClassForType appears to return VectorRC even
though the types in question (large vNi1 types) end up occupying scalar
registers. That might be worth fixing too.
Differential Revision: https://reviews.llvm.org/D125918
Added:
llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 376e1d13cdc10..4704697b40d2e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -730,7 +730,7 @@ class TargetTransformInfo {
bool isTypeLegal(Type *Ty) const;
/// Returns the estimated number of registers required to represent \p Ty.
- InstructionCost getRegUsageForType(Type *Ty) const;
+ unsigned getRegUsageForType(Type *Ty) const;
/// Return true if switches should be turned into lookup tables for the
/// target.
@@ -1593,7 +1593,7 @@ class TargetTransformInfo::Concept {
virtual bool isProfitableToHoist(Instruction *I) = 0;
virtual bool useAA() = 0;
virtual bool isTypeLegal(Type *Ty) = 0;
- virtual InstructionCost getRegUsageForType(Type *Ty) = 0;
+ virtual unsigned getRegUsageForType(Type *Ty) = 0;
virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
virtual bool shouldBuildRelLookupTables() = 0;
@@ -2032,7 +2032,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
}
bool useAA() override { return Impl.useAA(); }
bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
- InstructionCost getRegUsageForType(Type *Ty) override {
+ unsigned getRegUsageForType(Type *Ty) override {
return Impl.getRegUsageForType(Ty);
}
bool shouldBuildLookupTables() override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index fe937adde411b..383b91c7af13d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -312,7 +312,7 @@ class TargetTransformInfoImplBase {
bool isTypeLegal(Type *Ty) const { return false; }
- InstructionCost getRegUsageForType(Type *Ty) const { return 1; }
+ unsigned getRegUsageForType(Type *Ty) const { return 1; }
bool shouldBuildLookupTables() const { return true; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index a2b9860c46e6c..c1b9533b6fc1e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -382,10 +382,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return getTLI()->isTypeLegal(VT);
}
- InstructionCost getRegUsageForType(Type *Ty) {
- InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
- assert(Val >= 0 && "Negative cost!");
- return Val;
+ unsigned getRegUsageForType(Type *Ty) {
+ EVT ETy = getTLI()->getValueType(DL, Ty);
+ return getTLI()->getNumRegisters(Ty->getContext(), ETy);
}
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 384d95a5b70e5..3e4cacf17a40d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -473,7 +473,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
return TTIImpl->isTypeLegal(Ty);
}
-InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const {
+unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const {
return TTIImpl->getRegUsageForType(Ty);
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 435d192d4912f..1822640c74bae 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -429,7 +429,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getPeelingPreferences(L, SE, PP);
}
-InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
+unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
TypeSize Size = Ty->getPrimitiveSizeInBits();
if (Ty->isVectorTy()) {
if (Size.isScalable() && ST->hasVInstructions())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index cdec074dc2a46..bab157dbfef0c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -60,7 +60,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
- InstructionCost getRegUsageForType(Type *Ty);
+ unsigned getRegUsageForType(Type *Ty);
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f24622ee620a2..6a4159b06d577 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5987,16 +5987,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
- // A lambda that gets the register usage for the given type and VF.
- const auto &TTICapture = TTI;
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+ auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
return 0;
- InstructionCost::CostType RegUsage =
- *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
- assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
- "Nonsensical values for register usage.");
- return RegUsage;
+ return TTI.getRegUsageForType(VectorType::get(Ty, VF));
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
new file mode 100644
index 0000000000000..f0dc8e502769d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
@@ -0,0 +1,57 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "aarch64"
+
+; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
+; CHECK: LV(REG): VF = 32
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
+entry:
+ br label %loop
+exit:
+ ret i1 %reduction_next
+loop:
+ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+ %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+ %loaded = load i32, ptr %gep
+ %i1 = icmp eq i32 %loaded, %induction
+ %reduction_next = or i1 %i1, %reduction
+ %induction_next = add nuw i32 %induction, 1
+ %cond = icmp eq i32 %induction_next, %arg
+ br i1 %cond, label %exit, label %loop, !llvm.loop !32
+}
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
+; CHECK: LV(REG): VF = 64
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
+entry:
+ br label %loop
+exit:
+ ret i1 %reduction_next
+loop:
+ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+ %reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ]
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+ %loaded = load i32, ptr %gep
+ %i1 = icmp eq i32 %loaded, %induction
+ %reduction_next = or i1 %i1, %reduction
+ %induction_next = add nuw i32 %induction, 1
+ %cond = icmp eq i32 %induction_next, %arg
+ br i1 %cond, label %exit, label %loop, !llvm.loop !64
+}
+
+!32 = distinct !{!32, !33}
+!33 = !{!"llvm.loop.vectorize.width", i32 32}
+!64 = distinct !{!64, !65}
+!65 = !{!"llvm.loop.vectorize.width", i32 64}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
new file mode 100644
index 0000000000000..4cab716c75448
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
@@ -0,0 +1,32 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "x86_64"
+
+; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
+; CHECK: LV(REG): VF = 64
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
+entry:
+ br label %loop
+exit:
+ ret i1 %reduction_next
+loop:
+ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+ %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+ %loaded = load i32, ptr %gep
+ %i1 = icmp eq i32 %loaded, %induction
+ %reduction_next = or i1 %i1, %reduction
+ %induction_next = add nuw i32 %induction, 1
+ %cond = icmp eq i32 %induction_next, %arg
+ br i1 %cond, label %exit, label %loop, !llvm.loop !64
+}
+
+!64 = distinct !{!64, !65}
+!65 = !{!"llvm.loop.vectorize.width", i32 64}
More information about the llvm-commits
mailing list