[llvm] 37c87d5 - [LV][AArch64] LoopVectorizer allows scalable frem instructions (#76247)

Thu Jan 18 00:32:57 PST 2024

Author: Paschalis Mpeis
Date: 2024-01-18T08:32:53Z
New Revision: 37c87d5689134392ba801a9eb28785f9f0b3e5f7

URL: https://github.com/llvm/llvm-project/commit/37c87d5689134392ba801a9eb28785f9f0b3e5f7
DIFF: https://github.com/llvm/llvm-project/commit/37c87d5689134392ba801a9eb28785f9f0b3e5f7.diff

LOG: [LV][AArch64] LoopVectorizer allows scalable frem instructions (#76247)

LoopVectorizer is aware when a target can replace a scalable frem
instruction with a vector library call for a given VF and it returns the
relevant cost. Otherwise, it returns an invalid cost (as previously).

Add test that check costs on AArch64, when there is no vector library
available and when there is (with and without tail-folding).

NOTE: Invoking CostModel directly (not through LV) would still return
invalid costs.

Added: 
    llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index aa5d1bfa57d535..77d54c9d08f4e6 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6949,10 +6949,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
 
     SmallVector<const Value *, 4> Operands(I->operand_values());
-    return TTI.getArithmeticInstrCost(
+    auto InstrCost = TTI.getArithmeticInstrCost(
         I->getOpcode(), VectorTy, CostKind,
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
         Op2Info, Operands, I);
+
+    // Some targets can replace frem with vector library calls.
+    InstructionCost VecCallCost = InstructionCost::getInvalid();
+    if (I->getOpcode() == Instruction::FRem) {
+      LibFunc Func;
+      if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
+          TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
+        SmallVector<Type *, 4> OpTypes;
+        for (auto &Op : I->operands())
+          OpTypes.push_back(Op->getType());
+        VecCallCost =
+            TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
+      }
+    }
+    return std::min(InstrCost, VecCallCost);
   }
   case Instruction::FNeg: {
     return TTI.getArithmeticInstrCost(

diff  --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll
new file mode 100644
index 00000000000000..eb2db1596bef7d
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "estimated cost.*frem" --version 4
+
+; RUN: opt -mattr=+neon -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-NO-VECLIB
+
+; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-SLEEF
+
+; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-ARMPL
+
+; RUN: opt -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-NO-VECLIB
+
+; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-SLEEF
+
+; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-SLEEF-TAILFOLD
+
+; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-ARMPL
+
+; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-ARMPL-TAILFOLD
+
+; REQUIRES: asserts
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; NEON-NO-VECLIB-LABEL: 'frem_f64'
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+;
+; NEON-SLEEF-LABEL: 'frem_f64'
+; NEON-SLEEF:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
+; NEON-SLEEF:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+;
+; NEON-ARMPL-LABEL: 'frem_f64'
+; NEON-ARMPL:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
+; NEON-ARMPL:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+;
+; SVE-NO-VECLIB-LABEL: 'frem_f64'
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; SVE-SLEEF-LABEL: 'frem_f64'
+; SVE-SLEEF:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f64'
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; SVE-ARMPL-LABEL: 'frem_f64'
+; SVE-ARMPL:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f64'
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %res = frem double %in, %in
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %res, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; NEON-NO-VECLIB-LABEL: 'frem_f32'
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; NEON-NO-VECLIB:  LV: Found an estimated cost of 20 for VF 4 For instruction: %res = frem float %in, %in
+;
+; NEON-SLEEF-LABEL: 'frem_f32'
+; NEON-SLEEF:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
+; NEON-SLEEF:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+;
+; NEON-ARMPL-LABEL: 'frem_f32'
+; NEON-ARMPL:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
+; NEON-ARMPL:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+;
+; SVE-NO-VECLIB-LABEL: 'frem_f32'
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of 20 for VF 4 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; SVE-SLEEF-LABEL: 'frem_f32'
+; SVE-SLEEF:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f32'
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; SVE-ARMPL-LABEL: 'frem_f32'
+; SVE-ARMPL:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f32'
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %res = frem float %in, %in
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %res, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}