[PATCH] D147075: [SLP] Only use const args for intrinsic cost if all match.

Tue Mar 28 13:00:37 PDT 2023

fhahn created this revision.
fhahn added reviewers: ABataev, RKSimon, dtemirbulatov.
Herald added subscribers: vporpo, StephenFan, hiraditya.
Herald added a project: All.
fhahn requested review of this revision.
Herald added a subscriber: pcwang-thead.
Herald added a project: LLVM.

At the moment, the SLP vectorizer uses the arguments of the first
call in the bundle for the intrinsic cost estimate. This can lead to
mis-classifications, causing the SLP vectorizer to underestimate the
vector call cost.

The main case is constant arguments, where incorrectly assuming the same
constant is passed for all vector lanes can impact the cost computation.
An example where this estimate leads to sub-optimal code is fshl calls,
illustrated in the test case.

This patch updates SLP to be more careful and not use the arguments if
different constants are passed in different lanes.

This was exposed by 55c600819f92ed33bef868b5056b699915d645fa <https://reviews.llvm.org/rG55c600819f92ed33bef868b5056b699915d645fa>/D140392 <https://reviews.llvm.org/D140392>.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D147075

Files:
  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
  llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll


Index: llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll
===================================================================

--- llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll
+++ llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll
@@ -6,21 +6,18 @@
 ; CHECK-LABEL: define i64 @fshl
 ; CHECK-SAME: (i64 [[OR1:%.*]], i64 [[OR2:%.*]], i64 [[OR3:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[OR2]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[OR3]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <2 x i64> <i64 1, i64 2>)
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[OR1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> <i64 poison, i64 0>, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> <i64 17, i64 21>)
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> <i64 0, i64 poison>, <2 x i32> <i32 2, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP2]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[TMP7]], [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP5]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1
-; CHECK-NEXT:    [[ADD3:%.*]] = or i64 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT:    [[XOR5:%.*]] = xor i64 [[ADD3]], [[TMP12]]
+; CHECK-NEXT:    [[OR4:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR2]], i64 0, i64 1)
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i64 [[OR4]], 0
+; CHECK-NEXT:    [[OR5:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR3]], i64 0, i64 2)
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i64 [[OR5]], [[OR1]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[XOR1]], [[OR1]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 0, [[XOR2]]
+; CHECK-NEXT:    [[OR6:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR1]], i64 [[OR2]], i64 17)
+; CHECK-NEXT:    [[XOR3:%.*]] = xor i64 [[OR6]], [[ADD1]]
+; CHECK-NEXT:    [[OR7:%.*]] = tail call i64 @llvm.fshl.i64(i64 0, i64 0, i64 21)
+; CHECK-NEXT:    [[XOR4:%.*]] = xor i64 [[OR7]], [[ADD2]]
+; CHECK-NEXT:    [[ADD3:%.*]] = or i64 [[XOR3]], [[ADD2]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor i64 [[ADD3]], [[XOR4]]
 ; CHECK-NEXT:    ret i64 [[XOR5]]
 ;
 entry:
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6253,21 +6253,37 @@
 }
 
 static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+getVectorCallCosts(ArrayRef<Value *> CIs, FixedVectorType *VecTy,
                    TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
+  auto *CI = cast<CallInst>(CIs[0]);
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
   // Calculate the cost of the scalar and vector calls.
   SmallVector<Type *, 4> VecTys;
-  for (Use &Arg : CI->args())
+  SmallVector<const Value *> Args;
+  for (Use &Arg : CI->args()) {
     VecTys.push_back(
         FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+    Args.push_back(Arg);
+  }
+
+  bool CanUseArgs = true;
+  for (Value *C : CIs) {
+    auto *CI = cast<CallInst>(C);
+    if (!all_of(enumerate(CI->args()), [&Args](const auto &A) {
+          return !isa<Constant>(A.value()) || A.value() == Args[A.index()];
+        })) {
+      Args.clear();
+      break;
+    }
+  }
+
   FastMathFlags FMF;
   if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
     FMF = FPCI->getFastMathFlags();
-  SmallVector<const Value *> Arguments(CI->args());
-  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
-                                    dyn_cast<IntrinsicInst>(CI));
+  IntrinsicCostAttributes CostAttrs(ID, VecTy, Args, VecTys, FMF,
+                                    !Args.empty() ? dyn_cast<IntrinsicInst>(CI)
+                                                  : nullptr);
   auto IntrinsicCost =
     TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
 
@@ -7629,8 +7645,7 @@
                                    CI->getFunctionType()->params(), CostKind);
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
-      auto *CI = cast<CallInst>(VL0);
-      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+      auto VecCallCosts = getVectorCallCosts(VL, VecTy, TTI, TLI);
       return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D147075.509108.patch
Type: text/x-patch
Size: 4880 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20230328/cffc0a39/attachment.bin>