[PATCH] D107651: [AArch64][SVE] Teach cost model that masked loads/stores are cheap

Matt Devereau via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 10 05:29:30 PDT 2021


MattDevereau updated this revision to Diff 365445.
MattDevereau added a comment.

Changed the cost model to keep the scalarised NEON costs for 128-bit width vectors, but use the SVE costs for larger VLS sizes. Added a new regression test to assert the cost-model estimates depending on VLS width.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D107651/new/

https://reviews.llvm.org/D107651

Files:
  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
  llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll


Index: llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=1792 | FileCheck %s -D#VBITS=1024
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024
+
+define void @fixed-sve-vls() {
+; CHECK-LABEL: 'fixed-sve-vls'
+; CHECK:  Cost Model: Found an estimated cost of [[#mul(div(8191,VBITS)+1,2)]] for instruction: %v1024i1 = call <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1>* undef, i32 8, <1024 x i1> undef, <1024 x i1> undef)
+; CHECK:  Cost Model: Found an estimated cost of [[#mul(div(2047,VBITS)+1,2)]] for instruction: %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8>* undef, i32 8, <256 x i1> undef, <256 x i8> undef)
+; CHECK:  Cost Model: Found an estimated cost of [[#mul(div(4095,VBITS)+1,2)]] for instruction: %v256i16 = call <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16>* undef, i32 8, <256 x i1> undef, <256 x i16> undef)
+; CHECK:  Cost Model: Found an estimated cost of [[#mul(div(511,VBITS)+1,2)]] for instruction: %v16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 8, <16 x i1> undef, <16 x i32> undef)
+; CHECK:  Cost Model: Found an estimated cost of [[#mul(div(1023,VBITS)+1,2)]] for instruction: %v16i64 = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* undef, i32 8, <16 x i1> undef, <16 x i64> undef)
+; CHECK:  Cost Model: Found an estimated cost of [[#mul(div(8191,VBITS)+1,2)]] for instruction: %v512f16 = call <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half>* undef, i32 8, <512 x i1> undef, <512 x half> undef)
+; CHECK:  Cost Model: Found an estimated cost of [[#mul(div(8191,VBITS)+1,2)]] for instruction: %v256f32 = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* undef, i32 8, <256 x i1> undef, <256 x float> undef)
+; CHECK:  Cost Model: Found an estimated cost of [[#mul(div(8191,VBITS)+1,2)]] for instruction: %v128f64 = call <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double>* undef, i32 8, <128 x i1> undef, <128 x double> undef)
+; CHECK:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+entry:
+  %v1024i1 = call <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1> *undef, i32 8, <1024 x i1> undef, <1024 x i1> undef)
+  %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8> *undef, i32 8, <256 x i1> undef, <256 x i8> undef)
+  %v256i16 = call <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16> *undef, i32 8, <256 x i1> undef, <256 x i16> undef)
+  %v16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32> *undef, i32 8, <16 x i1> undef, <16 x i32> undef)
+  %v16i64 = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64> *undef, i32 8, <16 x i1> undef, <16 x i64> undef)
+
+  %v512f16 = call <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half> *undef, i32 8, <512 x i1> undef, <512 x half> undef)
+  %v256f32 = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float> *undef, i32 8, <256 x i1> undef, <256 x float> undef)
+  %v128f64 = call <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double> *undef, i32 8, <128 x i1> undef, <128 x double> undef)
+
+  ret void
+}
+
+declare <1024 x i1> @llvm.masked.load.v1024i1.p0v1024i1(<1024 x i1>*, i32, <1024 x i1>, <1024 x i1>)
+declare <256 x i8> @llvm.masked.load.v256i8.p0v256i8(<256 x i8>*, i32, <256 x i1>, <256 x i8>)
+declare <256 x i16> @llvm.masked.load.v256i16.p0v256i16(<256 x i16>*, i32, <256 x i1>, <256 x i16>)
+declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>*, i32, <16 x i1>, <16 x i64>)
+
+declare <512 x half> @llvm.masked.load.v512f16.p0v512f16(<512 x half>*, i32, <512 x i1>, <512 x half>)
+declare <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>*, i32, <256 x i1>, <256 x float>)
+declare <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double>*, i32, <128 x i1>, <128 x double>)
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1502,7 +1502,7 @@
 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       Align Alignment, unsigned AddressSpace,
                                       TTI::TargetCostKind CostKind) {
-  if (!isa<ScalableVectorType>(Src))
+  if (!isa<ScalableVectorType>(Src) && !ST->useSVEForFixedLengthVectors())
     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                         CostKind);
   auto LT = TLI->getTypeLegalizationCost(DL, Src);


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D107651.365445.patch
Type: text/x-patch
Size: 6867 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210810/2048f1e3/attachment.bin>


More information about the llvm-commits mailing list