[llvm] ade47bd - [LV] Improve register pressure estimate at high VFs

Mon May 23 01:01:47 PDT 2022

Author: Peter Waller
Date: 2022-05-23T07:57:45Z
New Revision: ade47bdc317bda57fe0f2a741e5bf271cfc44054

URL: https://github.com/llvm/llvm-project/commit/ade47bdc317bda57fe0f2a741e5bf271cfc44054
DIFF: https://github.com/llvm/llvm-project/commit/ade47bdc317bda57fe0f2a741e5bf271cfc44054.diff

LOG: [LV] Improve register pressure estimate at high VFs

Previously, `getRegUsageForType` was implemented using
`getTypeLegalizationCost`.  `getRegUsageForType` is used by the loop
vectorizer to estimate the register pressure caused by using a vector
type.  However, `getTypeLegalizationCost` currently only appears to
understand splitting and not scalarization, so it significantly
underestimates the register requirements.

Instead, use `getNumRegisters`, which understands when scalarization
can occur (via computeRegisterProperties).

This was discovered while investigating D118979 (Set maximum VF with
shouldMaximizeVectorBandwidth), where under fixed-length 512-bit SVE the
loop vectorizer previously ended up costing a v128i1 as 2 v64i*
registers, whereas it actually occupies 128 i32 registers.

I'm sending this patch early for comment; I'm still doing some sanity checking
with LNT.  I note that getRegisterClassForType appears to return VectorRC even
though the types in question (large vNi1 types) end up occupying scalar
registers. That might be worth fixing too.

Differential Revision: https://reviews.llvm.org/D125918

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
    llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 376e1d13cdc10..4704697b40d2e 100644

--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -730,7 +730,7 @@ class TargetTransformInfo {
   bool isTypeLegal(Type *Ty) const;
 
   /// Returns the estimated number of registers required to represent \p Ty.
-  InstructionCost getRegUsageForType(Type *Ty) const;
+  unsigned getRegUsageForType(Type *Ty) const;
 
   /// Return true if switches should be turned into lookup tables for the
   /// target.
@@ -1593,7 +1593,7 @@ class TargetTransformInfo::Concept {
   virtual bool isProfitableToHoist(Instruction *I) = 0;
   virtual bool useAA() = 0;
   virtual bool isTypeLegal(Type *Ty) = 0;
-  virtual InstructionCost getRegUsageForType(Type *Ty) = 0;
+  virtual unsigned getRegUsageForType(Type *Ty) = 0;
   virtual bool shouldBuildLookupTables() = 0;
   virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
   virtual bool shouldBuildRelLookupTables() = 0;
@@ -2032,7 +2032,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   }
   bool useAA() override { return Impl.useAA(); }
   bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
-  InstructionCost getRegUsageForType(Type *Ty) override {
+  unsigned getRegUsageForType(Type *Ty) override {
     return Impl.getRegUsageForType(Ty);
   }
   bool shouldBuildLookupTables() override {

diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index fe937adde411b..383b91c7af13d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -312,7 +312,7 @@ class TargetTransformInfoImplBase {
 
   bool isTypeLegal(Type *Ty) const { return false; }
 
-  InstructionCost getRegUsageForType(Type *Ty) const { return 1; }
+  unsigned getRegUsageForType(Type *Ty) const { return 1; }
 
   bool shouldBuildLookupTables() const { return true; }
 

diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index a2b9860c46e6c..c1b9533b6fc1e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -382,10 +382,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return getTLI()->isTypeLegal(VT);
   }
 
-  InstructionCost getRegUsageForType(Type *Ty) {
-    InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
-    assert(Val >= 0 && "Negative cost!");
-    return Val;
+  unsigned getRegUsageForType(Type *Ty) {
+    EVT ETy = getTLI()->getValueType(DL, Ty);
+    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
   }
 
   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,

diff  --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 384d95a5b70e5..3e4cacf17a40d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -473,7 +473,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
   return TTIImpl->isTypeLegal(Ty);
 }
 
-InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const {
+unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const {
   return TTIImpl->getRegUsageForType(Ty);
 }
 

diff  --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 435d192d4912f..1822640c74bae 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -429,7 +429,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
   BaseT::getPeelingPreferences(L, SE, PP);
 }
 
-InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
+unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
   TypeSize Size = Ty->getPrimitiveSizeInBits();
   if (Ty->isVectorTy()) {
     if (Size.isScalable() && ST->hasVInstructions())

diff  --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index cdec074dc2a46..bab157dbfef0c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -60,7 +60,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
 
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
 
-  InstructionCost getRegUsageForType(Type *Ty);
+  unsigned getRegUsageForType(Type *Ty);
 
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f24622ee620a2..6a4159b06d577 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5987,16 +5987,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
 
   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
 
-  // A lambda that gets the register usage for the given type and VF.
-  const auto &TTICapture = TTI;
-  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+  auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
       return 0;
-    InstructionCost::CostType RegUsage =
-        *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
-    assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
-           "Nonsensical values for register usage.");
-    return RegUsage;
+    return TTI.getRegUsageForType(VectorType::get(Ty, VF));
   };
 
   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
new file mode 100644
index 0000000000000..f0dc8e502769d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
@@ -0,0 +1,57 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "aarch64"
+
+; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
+; CHECK: LV(REG): VF = 32
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
+entry:
+  br label %loop
+exit:
+  ret i1 %reduction_next
+loop:
+  %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+  %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+  %loaded = load i32, ptr %gep
+  %i1 = icmp eq i32 %loaded, %induction
+  %reduction_next = or i1 %i1, %reduction
+  %induction_next = add nuw i32 %induction, 1
+  %cond = icmp eq i32 %induction_next, %arg
+  br i1 %cond, label %exit, label %loop, !llvm.loop !32
+}
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
+; CHECK: LV(REG): VF = 64
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
+entry:
+  br label %loop
+exit:
+  ret i1 %reduction_next
+loop:
+  %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+  %reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+  %loaded = load i32, ptr %gep
+  %i1 = icmp eq i32 %loaded, %induction
+  %reduction_next = or i1 %i1, %reduction
+  %induction_next = add nuw i32 %induction, 1
+  %cond = icmp eq i32 %induction_next, %arg
+  br i1 %cond, label %exit, label %loop, !llvm.loop !64
+}
+
+!32 = distinct !{!32, !33}
+!33 = !{!"llvm.loop.vectorize.width", i32 32}
+!64 = distinct !{!64, !65}
+!65 = !{!"llvm.loop.vectorize.width", i32 64}

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
new file mode 100644
index 0000000000000..4cab716c75448
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
@@ -0,0 +1,32 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "x86_64"
+
+; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
+; CHECK: LV(REG): VF = 64
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
+entry:
+  br label %loop
+exit:
+  ret i1 %reduction_next
+loop:
+  %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+  %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+  %loaded = load i32, ptr %gep
+  %i1 = icmp eq i32 %loaded, %induction
+  %reduction_next = or i1 %i1, %reduction
+  %induction_next = add nuw i32 %induction, 1
+  %cond = icmp eq i32 %induction_next, %arg
+  br i1 %cond, label %exit, label %loop, !llvm.loop !64
+}
+
+!64 = distinct !{!64, !65}
+!65 = !{!"llvm.loop.vectorize.width", i32 64}