[llvm] [LV][AArch64] LoopVectorizer allows scalable frem instructions (PR #76247)

Tue Jan 9 09:35:59 PST 2024

https://github.com/paschalis-mpeis updated https://github.com/llvm/llvm-project/pull/76247

>From 92c8ef778ddcdfe2f73ed50c4abb63ac417fce3f Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Thu, 21 Dec 2023 09:29:42 +0000
Subject: [PATCH 1/3] [LV][AArch64] LoopVectorizer allows scalable frem
 instructions

In AArch64, when an 'frem' instruction uses scalable vectors, it will be
replaced with a vector library call. LoopVectorize is now aware of that
so it no longer returns invalid costs.

When it is not scalable, it returns the default costs, which are
delegated to the BaseT TTI Implementation.
---
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  1 -
 .../AArch64/AArch64TargetTransformInfo.cpp    | 15 ++++++++++++
 .../CostModel/AArch64/arith-fp-sve.ll         | 23 ++++++++++---------
 .../Analysis/CostModel/AArch64/arith-fp.ll    |  2 +-
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index a5a18a538d7689..73be5db4341093 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -17,7 +17,6 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include <optional>
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b5b8b68291786d..0a76c670c68a8b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2902,6 +2902,21 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     if (!Ty->getScalarType()->isFP128Ty())
       return LT.first;
     [[fallthrough]];
+  case ISD::FREM: {
+    // Scalable frem instructions will be replaced with Vector library calls.
+    if (Ty->isScalableTy()) {
+      SmallVector<Type *, 4> OpTypes;
+      for (auto &Op : CxtI->operands())
+        OpTypes.push_back(Op->getType());
+
+      InstructionCost ScalableCost =
+          getCallInstrCost(nullptr, Ty, OpTypes, CostKind);
+      return ScalableCost;
+    } else {
+      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+                                           Op2Info);
+    }
+  }
   case ISD::FMUL:
   case ISD::FDIV:
     // These nodes are marked as 'custom' just to lower them to SVE.
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
index 18a1c31c03f748..682bb5a58a7846 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
@@ -1,7 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mattr=+sve -mattr=+fullfp16 -enable-no-nans-fp-math -disable-output -passes="print<cost-model>" %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
 
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @fadd() {
 ; CHECK-LABEL: 'fadd'
@@ -137,14 +138,14 @@ define void @fdiv() {
 
 define void @frem() {
 ; CHECK-LABEL: 'frem'
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F16 = frem <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V8F16 = frem <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V16F16 = frem <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V2F32 = frem <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F32 = frem <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V8F32 = frem <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V2F64 = frem <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F64 = frem <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F16 = frem <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F16 = frem <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16F16 = frem <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = frem <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = frem <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = frem <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = frem <vscale x 4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V4F16 = frem <vscale x 4 x half> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
index c352892354fc24..403ee8e861387e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -enable-no-nans-fp-math  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s
 
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
 
 define i32 @fadd(i32 %arg) {
 ; CHECK-LABEL: 'fadd'

>From d872bc43c4614be5f77625479b47eadfbe2d35ba Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Wed, 3 Jan 2024 14:34:19 +0000
Subject: [PATCH 2/3] [LV][AArch64] LoopVectorizer allows scalable frem
 instructions

LoopVectorizer is aware when a target can replace a scalable frem
instruction with a vector library call and it returns the relevant cost.
Otherwise, it returns an invalid cost (as previously).

Add test that check costs on AArch64, when there is no vector library
available and when there is, with and without tail-folding.

NOTE: Invoking CostModel directly (not through LV) would still return
invalid costs. To avoid this, it would require passing TargetLibraryInfo
to TargetTransformInfo API, which we preferred not to as the 'frem'
instruction is an isolated case.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 15 ---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 13 +++
 .../CostModel/AArch64/arith-fp-sve-frem.ll    | 98 +++++++++++++++++++
 .../CostModel/AArch64/arith-fp-sve.ll         | 16 +--
 4 files changed, 119 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/arith-fp-sve-frem.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0a76c670c68a8b..b5b8b68291786d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2902,21 +2902,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     if (!Ty->getScalarType()->isFP128Ty())
       return LT.first;
     [[fallthrough]];
-  case ISD::FREM: {
-    // Scalable frem instructions will be replaced with Vector library calls.
-    if (Ty->isScalableTy()) {
-      SmallVector<Type *, 4> OpTypes;
-      for (auto &Op : CxtI->operands())
-        OpTypes.push_back(Op->getType());
-
-      InstructionCost ScalableCost =
-          getCallInstrCost(nullptr, Ty, OpTypes, CostKind);
-      return ScalableCost;
-    } else {
-      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
-                                           Op2Info);
-    }
-  }
   case ISD::FMUL:
   case ISD::FDIV:
     // These nodes are marked as 'custom' just to lower them to SVE.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 51ce88480c0880..1a0411f29770f2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6950,6 +6950,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
         Legal->isInvariant(Op2))
       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
 
+    // Some targets replace frem with vector library calls.
+    if (I->getOpcode() == Instruction::FRem && VectorTy->isScalableTy()) {
+      LibFunc Func;
+      if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func)) {
+        if (TLI->isFunctionVectorizable(TLI->getName(Func))) {
+          SmallVector<Type *, 4> OpTypes;
+          for (auto &Op : I->operands())
+            OpTypes.push_back(Op->getType());
+          return TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
+        }
+      }
+    }
+
     SmallVector<const Value *, 4> Operands(I->operand_values());
     return TTI.getArithmeticInstrCost(
         I->getOpcode(), VectorTy, CostKind,
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve-frem.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve-frem.ll
new file mode 100644
index 00000000000000..70ec986081f551
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve-frem.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "cost.*vscale.*frem" --version 4
+
+; RUN: opt -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NO-VECLIB
+
+; RUN: opt -vector-library=sleefgnuabi -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SLEEF
+
+; RUN: opt -vector-library=sleefgnuabi -mattr=+sve -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SLEEF-TAILFOLD
+
+; RUN: opt -vector-library=ArmPL -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=ARMPL
+
+; RUN: opt -vector-library=ArmPL -mattr=+sve -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=ARMPL-TAILFOLD
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; NO-VECLIB-LABEL: 'frem_f64'
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in
+; NO-VECLIB:  LV: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): frem %res = frem double %in, %in
+;
+; SLEEF-LABEL: 'frem_f64'
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; SLEEF-TAILFOLD-LABEL: 'frem_f64'
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; ARMPL-LABEL: 'frem_f64'
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem double %in, %in
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; ARMPL-TAILFOLD-LABEL: 'frem_f64'
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem double %in, %in
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %res = frem double %in, %in
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %res, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; NO-VECLIB-LABEL: 'frem_f32'
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in
+; NO-VECLIB:  LV: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): frem %res = frem float %in, %in
+;
+; SLEEF-LABEL: 'frem_f32'
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; SLEEF-TAILFOLD-LABEL: 'frem_f32'
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; ARMPL-LABEL: 'frem_f32'
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem float %in, %in
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem float %in, %in
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; ARMPL-TAILFOLD-LABEL: 'frem_f32'
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem float %in, %in
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem float %in, %in
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 8
+  %res = frem float %in, %in
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %res, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
index 682bb5a58a7846..e61ffd186b74a6 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
@@ -138,14 +138,14 @@ define void @fdiv() {
 
 define void @frem() {
 ; CHECK-LABEL: 'frem'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F16 = frem <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F16 = frem <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16F16 = frem <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = frem <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = frem <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = frem <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = frem <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F16 = frem <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V8F16 = frem <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V16F16 = frem <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V2F32 = frem <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F32 = frem <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V8F32 = frem <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V2F64 = frem <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F64 = frem <vscale x 4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V4F16 = frem <vscale x 4 x half> undef, undef

>From ca9f165122ad5d6c061ac9748e1dde49afa9669e Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Wed, 3 Jan 2024 14:34:19 +0000
Subject: [PATCH 3/3] Addressing reviewers.

Comparing against the default cost, and no longer restricting to
scalable types.
---
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  1 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 19 +++++++++++--------
 .../CostModel/AArch64/arith-fp-sve.ll         |  7 +++----
 .../Analysis/CostModel/AArch64/arith-fp.ll    |  2 +-
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 73be5db4341093..a5a18a538d7689 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include <optional>
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1a0411f29770f2..63b2028d52ef00 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6950,24 +6950,27 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
         Legal->isInvariant(Op2))
       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
 
+    SmallVector<const Value *, 4> Operands(I->operand_values());
+    auto InstrCost = TTI.getArithmeticInstrCost(
+        I->getOpcode(), VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, I);
+
     // Some targets replace frem with vector library calls.
-    if (I->getOpcode() == Instruction::FRem && VectorTy->isScalableTy()) {
+    if (I->getOpcode() == Instruction::FRem) {
       LibFunc Func;
       if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func)) {
         if (TLI->isFunctionVectorizable(TLI->getName(Func))) {
           SmallVector<Type *, 4> OpTypes;
           for (auto &Op : I->operands())
             OpTypes.push_back(Op->getType());
-          return TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
+          auto CallCost =
+              TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
+          return std::min(InstrCost, CallCost);
         }
       }
     }
-
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    return TTI.getArithmeticInstrCost(
-        I->getOpcode(), VectorTy, CostKind,
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        Op2Info, Operands, I);
+    return InstrCost;
   }
   case Instruction::FNeg: {
     return TTI.getArithmeticInstrCost(
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
index e61ffd186b74a6..18a1c31c03f748 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
@@ -1,8 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -mattr=+sve -mattr=+fullfp16 -enable-no-nans-fp-math -disable-output -passes="print<cost-model>" %s 2>&1 | FileCheck %s
-
-target triple = "aarch64-unknown-linux-gnu"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s
 
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @fadd() {
 ; CHECK-LABEL: 'fadd'
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
index 403ee8e861387e..c352892354fc24 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -enable-no-nans-fp-math  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s
 
-target triple = "aarch64-unknown-linux-gnu"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define i32 @fadd(i32 %arg) {
 ; CHECK-LABEL: 'fadd'