[llvm] 718962f - [SystemZ] Provide improved cost estimates (#83873)

via llvm-commits <llvm-commits at lists.llvm.org>
Mon Mar 11 02:41:03 PDT 2024


Author: Dominik Steenken
Date: 2024-03-11T10:40:59+01:00
New Revision: 718962f53bfc610f670f1674457a426e01117097

URL: https://github.com/llvm/llvm-project/commit/718962f53bfc610f670f1674457a426e01117097
DIFF: https://github.com/llvm/llvm-project/commit/718962f53bfc610f670f1674457a426e01117097.diff

LOG: [SystemZ] Provide improved cost estimates (#83873)

This commit provides better cost estimates for the
llvm.vector.reduce.add intrinsic on SystemZ. These estimates apply to
all vector lengths and to integer element types up to i128. For element
types wider than i128, we fall back to the default cost estimate.

In effect, this lowers the estimated cost of most common instances of
the intrinsic. The expected performance impact is minimal, with a
tendency to slightly improve some benchmarks.

This commit also adds a test that checks the computation of the new
estimates, as well as the fallback for element types wider than i128.
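
For illustration, the new estimate can be reproduced outside of LLVM in
a few lines of C++. This is a minimal standalone sketch of the committed
formula, not the patch itself: the helper name reduceAddCost is made up,
SystemZ::VectorBits is hard-coded as 128, and the ceiling division
stands in for getNumVectorRegs (it matches for the integer vector types
exercised by the test).

#include <algorithm>

// Ceil(log2(V)) for V >= 1, mirroring llvm::Log2_32_Ceil.
static unsigned log2Ceil(unsigned V) {
  unsigned L = 0;
  while ((1u << L) < V)
    ++L;
  return L;
}

// Estimated cost of llvm.vector.reduce.add on a <NumElements x
// iScalarSize> input; returns -1 where the patch falls back to the
// generic estimate.
static int reduceAddCost(unsigned NumElements, unsigned ScalarSize) {
  const unsigned VectorBits = 128; // SystemZ::VectorBits
  if (ScalarSize > VectorBits)
    return -1;
  // V: vector registers needed to hold the input.
  unsigned VectorRegsNeeded =
      (NumElements * ScalarSize + VectorBits - 1) / VectorBits;
  // S: instructions for the final in-register sum.
  unsigned MaxElemsPerVector = VectorBits / ScalarSize;
  unsigned LastVectorHandling =
      2 * log2Ceil(std::min(NumElements, MaxElemsPerVector));
  return VectorRegsNeeded + LastVectorHandling - 1; // V + S - 1
}

// E.g. reduceAddCost(16, 32) == 4 + 4 - 1 == 7 and
// reduceAddCost(2, 64) == 1 + 2 - 1 == 2, matching the expected costs
// in the new test below.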

Added: 
    llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll

Modified: 
    llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 9370fb51a96c56..e4adb7be564952 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -20,6 +20,8 @@
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "systemztti"
@@ -1284,17 +1286,42 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
   return NumVectorMemOps + NumPermutes;
 }
 
-static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
+static int
+getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                            const SmallVectorImpl<Type *> &ParamTys) {
   if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
     return getNumVectorRegs(RetTy); // VPERM
+
+  if (ID == Intrinsic::vector_reduce_add) {
+    // Retrieve number and size of elements for the vector op.
+    auto *VTy = cast<FixedVectorType>(ParamTys.front());
+    unsigned NumElements = VTy->getNumElements();
+    unsigned ScalarSize = VTy->getScalarSizeInBits();
+    // For scalar sizes >128 bits, we fall back to the generic cost estimate.
+    if (ScalarSize > SystemZ::VectorBits)
+      return -1;
+    // A single vector register can hold this many elements.
+    unsigned MaxElemsPerVector = SystemZ::VectorBits / ScalarSize;
+    // This many vector regs are needed to represent the input elements (V).
+    unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
+    // This many instructions are needed for the final sum of vector elems (S).
+    unsigned LastVectorHandling =
+        2 * Log2_32_Ceil(std::min(NumElements, MaxElemsPerVector));
+    // We use vector adds to create a sum vector, which takes
+    // V/2 + V/4 + ... = V - 1 operations.
+    // Then, we need S operations to sum up the elements of that sum vector,
+    // for a total of V + S - 1 operations.
+    int Cost = VectorRegsNeeded + LastVectorHandling - 1;
+    return Cost;
+  }
   return -1;
 }
 
 InstructionCost
 SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) {
-  InstructionCost Cost =
-      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
+  InstructionCost Cost = getVectorIntrinsicInstrCost(
+      ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
   if (Cost != -1)
     return Cost;
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);

diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
new file mode 100644
index 00000000000000..061e5ece44a4e7
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -mtriple=systemz-unknown -mcpu=z13 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
+
+define void @reduce(ptr %src, ptr %dst) {
+; CHECK-LABEL: 'reduce'
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %V2_64)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %V4_64)
+; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
+; CHECK:  Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
+; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
+; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
+; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
+; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
+; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
+; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
+; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
+; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
+; CHECK:  Cost Model: Found an estimated cost of 8 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
+;
+; CHECK:  Cost Model: Found an estimated cost of 15 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
+; CHECK:  Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
+
+  ; REDUCEADD64
+
+  %V2_64 = load <2 x i64>, ptr %src, align 8
+  %R2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %V2_64)
+  store volatile i64 %R2_64, ptr %dst, align 4
+
+  %V4_64 = load <4 x i64>, ptr %src, align 8
+  %R4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %V4_64)
+  store volatile i64 %R4_64, ptr %dst, align 4
+
+  %V8_64 = load <8 x i64>, ptr %src, align 8
+  %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
+  store volatile i64 %R8_64, ptr %dst, align 4
+
+  %V16_64 = load <16 x i64>, ptr %src, align 8
+  %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
+  store volatile i64 %R16_64, ptr %dst, align 4
+
+  ; REDUCEADD32
+
+  %V2_32 = load <2 x i32>, ptr %src, align 8
+  %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
+  store volatile i32 %R2_32, ptr %dst, align 4
+
+  %V4_32 = load <4 x i32>, ptr %src, align 8
+  %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
+  store volatile i32 %R4_32, ptr %dst, align 4
+
+  %V8_32 = load <8 x i32>, ptr %src, align 8
+  %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
+  store volatile i32 %R8_32, ptr %dst, align 4
+
+  %V16_32 = load <16 x i32>, ptr %src, align 8
+  %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
+  store volatile i32 %R16_32, ptr %dst, align 4
+
+  ; REDUCEADD16
+
+  %V2_16 = load <2 x i16>, ptr %src, align 8
+  %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
+  store volatile i16 %R2_16, ptr %dst, align 4
+
+  %V4_16 = load <4 x i16>, ptr %src, align 8
+  %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
+  store volatile i16 %R4_16, ptr %dst, align 4
+
+  %V8_16 = load <8 x i16>, ptr %src, align 8
+  %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
+  store volatile i16 %R8_16, ptr %dst, align 4
+
+  %V16_16 = load <16 x i16>, ptr %src, align 8
+  %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
+  store volatile i16 %R16_16, ptr %dst, align 4
+
+  ; REDUCEADD8
+
+  %V2_8 = load <2 x i8>, ptr %src, align 8
+  %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
+  store volatile i8 %R2_8, ptr %dst, align 4
+
+  %V4_8 = load <4 x i8>, ptr %src, align 8
+  %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
+  store volatile i8 %R4_8, ptr %dst, align 4
+
+  %V8_8 = load <8 x i8>, ptr %src, align 8
+  %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
+  store volatile i8 %R8_8, ptr %dst, align 4
+
+  %V16_8 = load <16 x i8>, ptr %src, align 8
+  %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
+  store volatile i8 %R16_8, ptr %dst, align 4
+
+  ; EXTREME VALUES
+
+  %V128_8 = load <128 x i8>, ptr %src, align 8
+  %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
+  store volatile i8 %R128_8, ptr %dst, align 4
+
+  %V4_256 = load <4 x i256>, ptr %src, align 8
+  %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
+  store volatile i256 %R4_256, ptr %dst, align 8
+
+  ret void
+}
+
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
+declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>)
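
To see how the CHECK lines above follow from the formula: for
<16 x i16>, the input occupies V = 2 vector registers, a single register
holds 8 halfword elements, so the final sum takes S = 2 * log2(8) = 6
instructions, giving a cost of V + S - 1 = 7. Likewise <128 x i8> needs
V = 8 registers and S = 2 * log2(16) = 8, for a cost of 15. The
<4 x i256> case exceeds the 128-bit element limit, so its cost of 20
comes from the generic BaseT estimate rather than from the new formula.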