[llvm] [LV][AArch64] LoopVectorizer allows scalable frem instructions (PR #76247)

Thu Jan 4 03:36:02 PST 2024

https://github.com/paschalis-mpeis updated https://github.com/llvm/llvm-project/pull/76247

>From 63e8f68d5ec9fccef81a56c14cde72f1c6b3d99f Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Thu, 21 Dec 2023 09:29:42 +0000
Subject: [PATCH 1/2] [LV][AArch64] LoopVectorizer allows scalable frem
 instructions

On AArch64, an 'frem' instruction that uses scalable vectors will be
replaced with a vector library call. LoopVectorize is now aware of this,
so it no longer returns invalid costs.

When the type is not scalable, the default costs are returned; these are
delegated to the BaseT TTI implementation.
---
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  1 -
 .../AArch64/AArch64TargetTransformInfo.cpp    | 15 ++++++++++++
 .../CostModel/AArch64/arith-fp-sve.ll         | 23 ++++++++++---------
 .../Analysis/CostModel/AArch64/arith-fp.ll    |  2 +-
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 67246afa23147a..7212fdcca9695b 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -17,7 +17,6 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include <optional>
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b5b8b68291786d..0a76c670c68a8b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2902,6 +2902,21 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     if (!Ty->getScalarType()->isFP128Ty())
       return LT.first;
     [[fallthrough]];
+  case ISD::FREM: {
+    // Scalable frem instructions will be replaced with Vector library calls.
+    if (Ty->isScalableTy()) {
+      SmallVector<Type *, 4> OpTypes;
+      for (auto &Op : CxtI->operands())
+        OpTypes.push_back(Op->getType());
+
+      InstructionCost ScalableCost =
+          getCallInstrCost(nullptr, Ty, OpTypes, CostKind);
+      return ScalableCost;
+    } else {
+      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+                                           Op2Info);
+    }
+  }
   case ISD::FMUL:
   case ISD::FDIV:
     // These nodes are marked as 'custom' just to lower them to SVE.
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
index 18a1c31c03f748..682bb5a58a7846 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
@@ -1,7 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mattr=+sve -mattr=+fullfp16 -enable-no-nans-fp-math -disable-output -passes="print<cost-model>" %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
 
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @fadd() {
 ; CHECK-LABEL: 'fadd'
@@ -137,14 +138,14 @@ define void @fdiv() {
 
 define void @frem() {
 ; CHECK-LABEL: 'frem'
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F16 = frem <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V8F16 = frem <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V16F16 = frem <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V2F32 = frem <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F32 = frem <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V8F32 = frem <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V2F64 = frem <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F64 = frem <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F16 = frem <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F16 = frem <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16F16 = frem <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = frem <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = frem <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = frem <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = frem <vscale x 4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V4F16 = frem <vscale x 4 x half> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
index c352892354fc24..403ee8e861387e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -enable-no-nans-fp-math  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s
 
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
 
 define i32 @fadd(i32 %arg) {
 ; CHECK-LABEL: 'fadd'

>From a041258a20f9b7d0bb165e03e1d317ce95024b8c Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Wed, 3 Jan 2024 14:34:19 +0000
Subject: [PATCH 2/2] [LV][AArch64] LoopVectorizer allows scalable frem
 instructions

LoopVectorizer is aware when a target can replace a scalable frem
instruction with a vector library call and it returns the relevant cost.
Otherwise, it returns an invalid cost (as previously).

Add a test that checks costs on AArch64 when no vector library is
available and when one is, with and without tail-folding.

NOTE: Invoking CostModel directly (not through LV) would still return
invalid costs. Avoiding this would require passing TargetLibraryInfo
to the TargetTransformInfo API, which we preferred not to do, as the
'frem' instruction is an isolated case.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 15 ---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 13 +++
 .../CostModel/AArch64/arith-fp-sve-frem.ll    | 98 +++++++++++++++++++
 .../CostModel/AArch64/arith-fp-sve.ll         | 16 +--
 4 files changed, 119 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/arith-fp-sve-frem.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0a76c670c68a8b..b5b8b68291786d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2902,21 +2902,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     if (!Ty->getScalarType()->isFP128Ty())
       return LT.first;
     [[fallthrough]];
-  case ISD::FREM: {
-    // Scalable frem instructions will be replaced with Vector library calls.
-    if (Ty->isScalableTy()) {
-      SmallVector<Type *, 4> OpTypes;
-      for (auto &Op : CxtI->operands())
-        OpTypes.push_back(Op->getType());
-
-      InstructionCost ScalableCost =
-          getCallInstrCost(nullptr, Ty, OpTypes, CostKind);
-      return ScalableCost;
-    } else {
-      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
-                                           Op2Info);
-    }
-  }
   case ISD::FMUL:
   case ISD::FDIV:
     // These nodes are marked as 'custom' just to lower them to SVE.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 961d3d3bb1931d..f9d0e056adb77f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7142,6 +7142,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
         Legal->isInvariant(Op2))
       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
 
+    // Some targets replace frem with vector library calls.
+    if (I->getOpcode() == Instruction::FRem && VectorTy->isScalableTy()) {
+      LibFunc Func;
+      if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func)) {
+        if (TLI->isFunctionVectorizable(TLI->getName(Func))) {
+          SmallVector<Type *, 4> OpTypes;
+          for (auto &Op : I->operands())
+            OpTypes.push_back(Op->getType());
+          return TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
+        }
+      }
+    }
+
     SmallVector<const Value *, 4> Operands(I->operand_values());
     return TTI.getArithmeticInstrCost(
         I->getOpcode(), VectorTy, CostKind,
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve-frem.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve-frem.ll
new file mode 100644
index 00000000000000..70ec986081f551
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve-frem.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "cost.*vscale.*frem" --version 4
+
+; RUN: opt -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NO-VECLIB
+
+; RUN: opt -vector-library=sleefgnuabi -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SLEEF
+
+; RUN: opt -vector-library=sleefgnuabi -mattr=+sve -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SLEEF-TAILFOLD
+
+; RUN: opt -vector-library=ArmPL -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=ARMPL
+
+; RUN: opt -vector-library=ArmPL -mattr=+sve -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=ARMPL-TAILFOLD
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; NO-VECLIB-LABEL: 'frem_f64'
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in
+; NO-VECLIB:  LV: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): frem %res = frem double %in, %in
+;
+; SLEEF-LABEL: 'frem_f64'
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; SLEEF-TAILFOLD-LABEL: 'frem_f64'
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem double %in, %in
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; ARMPL-LABEL: 'frem_f64'
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem double %in, %in
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+; ARMPL-TAILFOLD-LABEL: 'frem_f64'
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem double %in, %in
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ; induction variable, starts at 0
+  %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+  %in = load double, ptr %in.gep, align 8
+  %res = frem double %in, %in ; the instruction whose per-VF cost the check lines above match
+  %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+  store double %res, ptr %out.gep, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000 ; leave the loop once %iv.next reaches 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; NO-VECLIB-LABEL: 'frem_f32'
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
+; NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in
+; NO-VECLIB:  LV: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): frem %res = frem float %in, %in
+;
+; SLEEF-LABEL: 'frem_f32'
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; SLEEF-TAILFOLD-LABEL: 'frem_f32'
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem float %in, %in
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem float %in, %in
+; SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; ARMPL-LABEL: 'frem_f32'
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem float %in, %in
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem float %in, %in
+; ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+; ARMPL-TAILFOLD-LABEL: 'frem_f32'
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 1 For instruction: %res = frem float %in, %in
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem float %in, %in
+; ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4 ; float elements are 4 bytes apart, so only 4-byte alignment holds
+  %res = frem float %in, %in
+  %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+  store float %res, ptr %out.gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
index 682bb5a58a7846..e61ffd186b74a6 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
@@ -138,14 +138,14 @@ define void @fdiv() {
 
 define void @frem() {
 ; CHECK-LABEL: 'frem'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F16 = frem <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F16 = frem <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16F16 = frem <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = frem <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = frem <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = frem <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = frem <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F16 = frem <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V8F16 = frem <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V16F16 = frem <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V2F32 = frem <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F32 = frem <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V8F32 = frem <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V2F64 = frem <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %V4F64 = frem <vscale x 4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V4F16 = frem <vscale x 4 x half> undef, undef