[llvm] 060cfd9 - [AArch64][SVE]Add cost model for masked gather and scatter for scalable vector.

Mon Jan 4 06:01:25 PST 2021

Author: Caroline Concatto
Date: 2021-01-04T13:59:58Z
New Revision: 060cfd97954835c3be18e47c631d3efb3e374439

URL: https://github.com/llvm/llvm-project/commit/060cfd97954835c3be18e47c631d3efb3e374439
DIFF: https://github.com/llvm/llvm-project/commit/060cfd97954835c3be18e47c631d3efb3e374439.diff

LOG: [AArch64][SVE]Add cost model for masked gather and scatter for scalable vector.

    A new TTI interface has been added 'Optional <unsigned>getMaxVScale' that
    returns the maximum vscale for a given target.
    When known getMaxVScale is used to compute the cost of masked gather scatter
    for scalable vector.

    Depends on D92094

    Differential Revision: https://reviews.llvm.org/D93030

Added: 
    llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-gather.ll
    llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-scatter.ll

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0953a3b3f451..d9d04429b181 100644

--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -928,6 +928,10 @@ class TargetTransformInfo {
   /// \return The width of the smallest vector register type.
   unsigned getMinVectorRegisterBitWidth() const;
 
+  /// \return The maximum value of vscale if the target specifies an
+  ///  architectural maximum vector length, and None otherwise.
+  Optional<unsigned> getMaxVScale() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1504,6 +1508,7 @@ class TargetTransformInfo::Concept {
   virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
+  virtual Optional<unsigned> getMaxVScale() const = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
   virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
@@ -1921,6 +1926,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMinVectorRegisterBitWidth() override {
     return Impl.getMinVectorRegisterBitWidth();
   }
+  Optional<unsigned> getMaxVScale() const override {
+    return Impl.getMaxVScale();
+  }
   bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
     return Impl.shouldMaximizeVectorBandwidth(OptSize);
   }

diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 620bfb885b54..ef0653d0d9f4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -369,6 +369,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMinVectorRegisterBitWidth() const { return 128; }
 
+  Optional<unsigned> getMaxVScale() const { return None; }
+
   bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }

diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 02f1b73226fc..9776c20400d6 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -571,6 +571,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  Optional<unsigned> getMaxVScale() const { return None; }
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
   /// extracted from vectors.
@@ -1239,8 +1241,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       return thisT()->getMemcpyCost(ICA.getInst());
 
     case Intrinsic::masked_scatter: {
-      if (isa<ScalableVectorType>(RetTy))
-        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       assert(VF.isScalar() && "Can't vectorize types here.");
       const Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
@@ -1250,8 +1250,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                              VarMask, Alignment, CostKind, I);
     }
     case Intrinsic::masked_gather: {
-      if (isa<ScalableVectorType>(RetTy))
-        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       assert(VF.isScalar() && "Can't vectorize types here.");
       const Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);

diff  --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index becf74c64fd5..5100109959d6 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -627,6 +627,10 @@ unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
   return TTIImpl->getMinVectorRegisterBitWidth();
 }
 
+Optional<unsigned> TargetTransformInfo::getMaxVScale() const {
+  return TTIImpl->getMaxVScale();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
 }

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d97570755291..aaf7371c7933 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -770,6 +770,26 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   return Options;
 }
 
+unsigned AArch64TTIImpl::getGatherScatterOpCost(
+    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
+  auto *VT = cast<VectorType>(DataTy);
+  auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
+  ElementCount LegalVF = LT.second.getVectorElementCount();
+  if (!LegalVF.isScalable())
+    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+                                         Alignment, CostKind, I);
+
+  Optional<unsigned> MaxNumVScale = getMaxVScale();
+  assert(MaxNumVScale && "Expected valid max vscale value");
+
+  unsigned MemOpCost =
+      getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
+  unsigned MaxNumElementsPerGather =
+      MaxNumVScale.getValue() * LegalVF.getKnownMinValue();
+  return LT.first * MaxNumElementsPerGather * MemOpCost;
+}
+
 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
   return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
 }

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c8e721b1fb9f..7dded02b2a6f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -115,8 +115,19 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return ST->getMinVectorRegisterBitWidth();
   }
 
+  Optional<unsigned> getMaxVScale() const {
+    if (ST->hasSVE())
+      return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+    return BaseT::getMaxVScale();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
+  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                                  const Value *Ptr, bool VariableMask,
+                                  Align Alignment, TTI::TargetCostKind CostKind,
+                                  const Instruction *I = nullptr);
+
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                        const Instruction *I = nullptr);

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-gather.ll
new file mode 100644
index 000000000000..38b41b731dd0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-gather.ll
@@ -0,0 +1,37 @@
+; Check getIntrinsicInstrCost in BasicTTIImpl.h for masked gather
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve  < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(<vscale x 4 x i32*> %ld, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: 'masked_gather_nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction:   %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:   ret <vscale x 4 x i32> %res
+  %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 8 x i32> @masked_gather_nxv8i32(<vscale x 8 x i32*> %ld, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) {
+; CHECK-LABEL: 'masked_gather_nxv8i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction:   %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:   ret <vscale x 8 x i32> %res
+  %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru)
+  ret <vscale x 8 x i32> %res
+}
+
+define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ld, <4 x i1> %masks, <4 x i32> %passthru) {
+; CHECK-LABEL: 'masked_gather_v4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction:   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+
+  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
+  ret <4 x i32> %res
+}
+
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 %align, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru)
+declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> %ptrs, i32 %align, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru)
+declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthru)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-scatter.ll b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-scatter.ll
new file mode 100644
index 000000000000..4370922e4bf7
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-scatter.ll
@@ -0,0 +1,40 @@
+; Check getIntrinsicInstrCost in BasicTTIImpl.h with for  masked scatter
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve  < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+
+define void @masked_scatter_nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, <vscale x 4 x i1> %masks) {
+; CHECK-LABEL: 'masked_scatter_nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction:   call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:   ret void
+
+  call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, <vscale x 8 x i1> %masks) {
+; CHECK-LABEL: 'masked_scatter_nxv8i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction:   call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 0, <vscale x 8 x i1> %masks)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:   ret void
+
+  call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 0, <vscale x 8 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_v4i32(<4 x i32> %data, <4 x i32*> %ptrs, <4 x i1> %masks) {
+; CHECK-LABEL: 'masked_scatter_v4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction:   call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 0, <4 x i1> %masks)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:   ret void
+
+  call void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 0, <4 x i1> %masks)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 %align, <vscale x 4 x i1> %masks)
+declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 %align, <vscale x 8 x i1> %masks)
+declare void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 %align, <4 x i1> %masks)