[llvm] 3bd161e - [LV] Honor forced scalars in setVectorizedCallDecision.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 3 13:07:13 PDT 2024
Author: Florian Hahn
Date: 2024-09-03T21:06:32+01:00
New Revision: 3bd161e98d89d31696002994771b7761f1c74859
URL: https://github.com/llvm/llvm-project/commit/3bd161e98d89d31696002994771b7761f1c74859
DIFF: https://github.com/llvm/llvm-project/commit/3bd161e98d89d31696002994771b7761f1c74859.diff
LOG: [LV] Honor forced scalars in setVectorizedCallDecision.
Similar to dd94537b4, setVectorizedCallDecision also did not consider
ForcedScalars. This led to VPlans not reflecting the decision made by
the legacy cost model (the cost computation would use the scalar cost,
while the VPlan would contain a VPWidenCallRecipe).
To fix this, check if the call has been forced to scalar in
setVectorizedCallDecision.
Note that this requires moving setVectorizedCallDecision after
collectLoopUniforms (which sets ForcedScalars). collectLoopUniforms does
not depend on call decisions and can safely be moved.
Fixes https://github.com/llvm/llvm-project/issues/107051.
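For illustration only, the shape of the check this patch adds can be sketched
outside of LLVM as a lookup into a per-VF forced-scalar set that is consulted
before any widened-call alternative is considered. The snippet below is a
minimal standalone sketch using plain standard-library containers; the type
names and the decideForCall helper are hypothetical stand-ins, not the actual
LoopVectorizationCostModel API (see the diff below for the real change).

#include <iostream>
#include <map>
#include <set>
#include <string>

// Hypothetical stand-ins for the LLVM types involved; illustration only.
using CallInstRef = std::string;   // stands in for llvm::CallInst *
using ElementCount = unsigned;     // stands in for llvm::ElementCount

enum class Decision { Widen, Scalarize };

// Per-VF set of calls the cost model has forced to scalar, mirroring
// LoopVectorizationCostModel::ForcedScalars (simplified).
std::map<ElementCount, std::set<CallInstRef>> ForcedScalars;

// Models the fixed setVectorizedCallDecision: honor ForcedScalars before
// any widened-call alternative is considered.
Decision decideForCall(const CallInstRef &CI, ElementCount VF) {
  auto It = ForcedScalars.find(VF);
  if (VF > 1 && It != ForcedScalars.end() && It->second.count(CI))
    return Decision::Scalarize; // keep the VPlan in sync with the legacy model
  return Decision::Widen;       // fall through to the usual vector-cost logic
}

int main() {
  ForcedScalars[4].insert("call @llvm.smax.i32");
  std::cout << (decideForCall("call @llvm.smax.i32", 4) == Decision::Scalarize
                    ? "scalarized\n"
                    : "widened\n");
  return 0;
}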
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 17050b2b433caa..0200525a718d5f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1290,8 +1290,8 @@ class LoopVectorizationCostModel {
if (VF.isScalar() || Uniforms.contains(VF))
return;
setCostBasedWideningDecision(VF);
- setVectorizedCallDecision(VF);
collectLoopUniforms(VF);
+ setVectorizedCallDecision(VF);
collectLoopScalars(VF);
}
@@ -6194,6 +6194,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
assert(!VF.isScalar() &&
"Trying to set a vectorization decision for a scalar VF");
+ auto ForcedScalar = ForcedScalars.find(VF);
for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
@@ -6206,14 +6207,37 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
InstructionCost VectorCost = InstructionCost::getInvalid();
InstructionCost IntrinsicCost = InstructionCost::getInvalid();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
Function *ScalarFunc = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
- bool MaskRequired = Legal->isMaskRequired(CI);
for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());
+ // Estimate cost of scalarized vector call. The source operands are
+ // assumed to be vectors, so we need to extract individual elements from
+ // there, execute VF scalar calls, and then gather the result into the
+ // vector return value.
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ InstructionCost ScalarizationCost =
+ getScalarizationOverhead(CI, VF, CostKind);
+
+ ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
+ // Honor ForcedScalars decision.
+ // TODO: For calls, it might still be more profitable to widen. Use
+ // VPlan-based cost model to compare different options.
+ if (VF.isVector() && ForcedScalar != ForcedScalars.end() &&
+ ForcedScalar->second.contains(CI)) {
+ setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
+ Intrinsic::not_intrinsic, std::nullopt,
+ ScalarCost);
+ continue;
+ }
+
+ bool MaskRequired = Legal->isMaskRequired(CI);
// Compute corresponding vector type for return value and arguments.
Type *RetTy = ToVectorTy(ScalarRetTy, VF);
for (Type *ScalarTy : ScalarTys)
@@ -6229,20 +6253,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
continue;
}
- // Estimate cost of scalarized vector call. The source operands are
- // assumed to be vectors, so we need to extract individual elements from
- // there, execute VF scalar calls, and then gather the result into the
- // vector return value.
- InstructionCost ScalarCallCost =
- TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
-
- // Compute costs of unpacking argument values for the scalar calls and
- // packing the return values to a vector.
- InstructionCost ScalarizationCost =
- getScalarizationOverhead(CI, VF, CostKind);
-
- ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
// Find the cost of vectorizing the call, if we can find a suitable
// vector variant of the function.
bool UsesMask = false;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index a3f9459f2fc67e..4eba40ff91f009 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -178,7 +178,57 @@ exit:
ret void
}
-declare double @llvm.sqrt.f64(double) #0
+define void @call_forced_scalar(ptr %src.1, ptr %src.2, ptr noalias %dst.1, ptr noalias %dst.2) {
+; CHECK-LABEL: define void @call_forced_scalar(
+; CHECK-SAME: ptr [[SRC_1:%.*]], ptr [[SRC_2:%.*]], ptr noalias [[DST_1:%.*]], ptr noalias [[DST_2:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_1]], align 4
+; CHECK-NEXT: [[SMAX:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT: [[UMIN:%.*]] = tail call i32 @llvm.umin.i32(i32 [[SMAX]], i32 1)
+; CHECK-NEXT: [[UMIN_EXT:%.*]] = zext i32 [[UMIN]] to i64
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[UMIN_EXT]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 3, [[L_EXT]]
+; CHECK-NEXT: store i32 [[MUL]], ptr [[DST_1]], align 4
+; CHECK-NEXT: [[GEP_DST_2:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[IV]]
+; CHECK-NEXT: store i32 0, ptr [[GEP_DST_2]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %0 = load i32, ptr %src.1, align 4
+ %smax = tail call i32 @llvm.smax.i32(i32 %0, i32 0)
+ %umin = tail call i32 @llvm.umin.i32(i32 %smax, i32 1)
+ %umin.ext = zext i32 %umin to i64
+ %gep.src.2 = getelementptr i8, ptr %src.2, i64 %umin.ext
+ %1 = load i8, ptr %gep.src.2, align 1
+ %l.ext = zext i8 %1 to i32
+ %mul = mul i32 3, %l.ext
+ store i32 %mul, ptr %dst.1, align 4
+ %gep.dst.2 = getelementptr i32, ptr %dst.2, i64 %iv
+ store i32 0, ptr %gep.dst.2, align 4
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 0
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare double @llvm.sqrt.f64(double)
declare double @llvm.powi.f64.i32(double, i32)
declare i64 @llvm.fshl.i64(i64, i64, i64)
;.