[llvm] 4d425f8 - [PowerPC] vector cost model add cost to extract i1

Mon Aug 14 14:04:33 PDT 2023

Author: Roland Froese
Date: 2023-08-14T17:04:11-04:00
New Revision: 4d425f86632fc2a7142d2ba1c8d67a19c620d355

URL: https://github.com/llvm/llvm-project/commit/4d425f86632fc2a7142d2ba1c8d67a19c620d355
DIFF: https://github.com/llvm/llvm-project/commit/4d425f86632fc2a7142d2ba1c8d67a19c620d355.diff

LOG: [PowerPC] vector cost model add cost to extract i1

Try to avoid some unprofitable predication on PPC. Recognize in the cost model that computing on i1 values will require extra mask or compare operation.

Differential Revision: https://reviews.llvm.org/D155876

Added: 
    llvm/test/Transforms/LoopVectorize/PowerPC/predcost.ll

Modified: 
    llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll
    llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 8137b61f4982b9..ca0f2c2e18af5f 100644

--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -27,6 +27,9 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ppctti"
 
+static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
+cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);
+
 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
 
@@ -700,6 +703,9 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     return Cost;
 
   } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
+    unsigned EltSize = Val->getScalarSizeInBits();
+    // Computing on 1 bit values requires extra mask or compare operations.
+    unsigned MaskCost = VecMaskCost && EltSize == 1 ? 1 : 0;
     if (ST->hasP9Altivec()) {
       if (ISD == ISD::INSERT_VECTOR_ELT)
         // A move-to VSR and a permute/insert.  Assume vector operation cost
@@ -721,12 +727,15 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
       // We need a vector extract (or mfvsrld).  Assume vector operation cost.
       // The cost of the load constant for a vector extract is disregarded
       // (invariant, easily schedulable).
-      return CostFactor;
+      return CostFactor + MaskCost;
 
-    } else if (ST->hasDirectMove())
+    } else if (ST->hasDirectMove()) {
       // Assume permute has standard cost.
       // Assume move-to/move-from VSR have 2x standard cost.
-      return 3;
+      if (ISD == ISD::INSERT_VECTOR_ELT)
+        return 3;
+      return 3 + MaskCost;
+    }
   }
 
   // Estimated cost of a load-hit-store delay.  This was obtained

diff  --git a/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll b/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll
index 3354664512e12d..00ec3fe6623140 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll
@@ -3,14 +3,14 @@
 
 define i32 @reduce_i1(i32 %arg) {
 ; CHECK-LABEL: 'reduce_i1'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 386 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 257 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 514 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)

diff  --git a/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll b/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll
index d33cfdd88dc664..5ae89592ddd94e 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll
@@ -3,14 +3,14 @@
 
 define i32 @reduce_i1(i32 %arg) {
 ; CHECK-LABEL: 'reduce_i1'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 386 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 257 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 514 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)

diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/predcost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/predcost.ll
new file mode 100644
index 00000000000000..be2dfd4739e446
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/predcost.ll
@@ -0,0 +1,29 @@
+; RUN: opt -ppc-vec-mask-cost=true -aa-pipeline=basic-aa -mcpu=pwr8 -S -passes=loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-Fn32-i64:64-n32:64-S128-v256:256:256-v512:512:512"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+define dso_local void @_tc(ptr nocapture noundef %aaa, i64 noundef %bbb) local_unnamed_addr {
+; CHECK-NOT: extractelement <16 x i1>
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  ret void
+
+for.body:                                         ; preds = %for.inc, %entry
+  %i.08 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %aaa, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  store i8 32, ptr %arrayidx, align 1
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %bbb
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}