[llvm] f7a7efb - [AMDGPU] Tweak getTypeLegalizationCost()

Mon Jul 6 14:07:57 PDT 2020

Author: Stanislav Mekhanoshin
Date: 2020-07-06T14:07:48-07:00
New Revision: f7a7efbf88b72b4aa6bd95a1ded6dacd2237f2f8

URL: https://github.com/llvm/llvm-project/commit/f7a7efbf88b72b4aa6bd95a1ded6dacd2237f2f8
DIFF: https://github.com/llvm/llvm-project/commit/f7a7efbf88b72b4aa6bd95a1ded6dacd2237f2f8.diff

LOG: [AMDGPU] Tweak getTypeLegalizationCost()

Even though wide vectors are legal they still cost more as we
will have to eventually split them. Not all operations can
be uniformly done on vector types.

Conservatively add the cost of splitting at least to 8 dwords,
which is our widest possible load.

We are more or less lying to cost mode with this change but
this can prevent vectorizer from creation of wide vectors which
results in RA problems for us.

Differential Revision: https://reviews.llvm.org/D83078

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h
    llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
    llvm/test/Analysis/CostModel/AMDGPU/mul.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a3135a787639..d90272848500 100644

--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11690,3 +11690,18 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
   SmallPtrSet<const Value *, 16> Visited;
   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
 }
+
+std::pair<int, MVT>
+SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
+                                          Type *Ty) const {
+  auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+  auto Size = DL.getTypeSizeInBits(Ty);
+  // Maximum load or store can handle 8 dwords for scalar and 4 for
+  // vector ALU. Let's assume anything above 8 dwords is expensive
+  // even if legal.
+  if (Size <= 256)
+    return Cost;
+
+  Cost.first = (Size + 255) / 256;
+  return Cost;
+}

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index ffe9140d3d07..f4c076464057 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -464,6 +464,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                       MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) const;
+
+  std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
+                                              Type *Ty) const;
 };
 
 } // End namespace llvm

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 9a2c01058b28..609769fd5148 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -90,7 +90,7 @@ define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
 }
 
 ; ALL: 'add_v16i64'
-; ALL: estimated cost of 32 for {{.*}} add <16 x i64>
+; ALL: estimated cost of 128 for {{.*}} add <16 x i64>
 define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
   %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
   %add = add <16 x i64> %vec, %b

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index 4d8a66ecd429..fa36d391f9c3 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -90,7 +90,7 @@ define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
 
 
 ; ALL: 'mul_v8i64'
-; ALL: estimated cost of 128 for {{.*}} mul <8 x i64>
+; ALL: estimated cost of 256 for {{.*}} mul <8 x i64>
 define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b