[llvm] [AArch64][TTI] Improve `LegalVF` when computing gather-loads cost (PR #69617)
Antonio Frighetto via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 27 07:26:28 PDT 2023
https://github.com/antoniofrighetto updated https://github.com/llvm/llvm-project/pull/69617
From 7b5fa1d1702a2922ef5bc4568f9939cae1b211fd Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Thu, 19 Oct 2023 19:08:03 +0200
Subject: [PATCH] [AArch64][TTI] Fix `LegalVF` when gather loads are scalarized
After SLP has determined the cost of the loads that could not be
coalesced into `VectorizedLoads`, it computes the cost of vectorizing
them as a gather load. Return an invalid cost when the type of such a
group of loads, a vector whose width depends on `VF`, may be legalized
into a scalar value.
Fixes: https://github.com/llvm/llvm-project/issues/68953.
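For illustration, here is a minimal, self-contained C++ sketch of the guard's intent; all names below (Cost, VecTy, legalize, isLegalElementType) are hypothetical stand-ins for LLVM's InstructionCost, the data type, getTypeLegalizationCost, and isElementTypeLegalForScalableVector, not the actual in-tree code:

#include <cstdint>
#include <limits>

// Hypothetical stand-in for llvm::InstructionCost.
struct Cost {
  uint64_t Value;
  static Cost invalid() { return {std::numeric_limits<uint64_t>::max()}; }
  bool isValid() const { return Value != std::numeric_limits<uint64_t>::max(); }
};

enum class LegalKind { Vector, Scalar };

// Hypothetical stand-in for a fixed or scalable vector type.
struct VecTy {
  unsigned EltBits; // e.g. 128 for i128/fp128 elements
  unsigned NumElts;
};

// Stand-in for type legalization: AArch64 has no gather for 128-bit
// elements, so e.g. <2 x i128> legalizes to scalar i128 operations.
static LegalKind legalize(const VecTy &Ty) {
  return Ty.EltBits > 64 ? LegalKind::Scalar : LegalKind::Vector;
}

// Stand-in for isElementTypeLegalForScalableVector.
static bool isLegalElementType(const VecTy &Ty) { return Ty.EltBits <= 64; }

Cost gatherCost(const VecTy &Ty) {
  // The fix: if the group of loads legalizes to a scalar value, or its
  // element type cannot be placed in a scalable vector, the gather cannot
  // be lowered, so report an invalid cost rather than a bogus finite one.
  if (legalize(Ty) != LegalKind::Vector || !isLegalElementType(Ty))
    return Cost::invalid();
  return {Ty.NumElts * 2}; // placeholder finite cost
}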
---
.../AArch64/AArch64TargetTransformInfo.cpp | 7 +++
.../CostModel/AArch64/sve-illegal-types.ll | 35 ++++++++----
.../SLPVectorizer/AArch64/gather-load-128.ll | 54 +++++++++++++++++++
3 files changed, 87 insertions(+), 9 deletions(-)
create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5f2d09f0765aa38..2a9cba89d761edb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2996,6 +2996,9 @@ static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
+ if (!isLegalMaskedGatherScatter(DataTy))
+ return InstructionCost::getInvalid();
+
if (useNeonVector(DataTy))
return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
Alignment, CostKind, I);
@@ -3004,6 +3007,10 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
if (!LT.first.isValid())
return InstructionCost::getInvalid();
+ if (!LT.second.isVector() ||
+ !isElementTypeLegalForScalableVector(VT->getElementType()))
+ return InstructionCost::getInvalid();
+
// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
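On the caller side, a TTI client such as the SLP vectorizer is expected to treat the invalid cost as "do not emit a gather". A hedged fragment (assuming the usual LLVM headers are in scope; LoadVecTy, CommonAlignment, CostKind, and ScalarLoadsCost are illustrative names, not the actual SLPVectorizer variables):

InstructionCost GatherCost = TTI.getGatherScatterOpCost(
    Instruction::Load, LoadVecTy, /*Ptr=*/nullptr,
    /*VariableMask=*/false, CommonAlignment, CostKind);
if (!GatherCost.isValid()) {
  // The target cannot lower this gather (e.g. <2 x i128> on AArch64);
  // keep the loads scalar instead of pretending the gather is cheap.
  GatherCost = ScalarLoadsCost;
}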
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll b/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
index 3e85760c3f53ec3..eb9b94c9be61e33 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
@@ -1,14 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @load_store(ptr %ptrs) {
; CHECK-LABEL: 'load_store'
-; CHECK-NEXT: Invalid cost for instruction: %load1 = load <vscale x 1 x i128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load2 = load <vscale x 2 x i128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load3 = load <vscale x 1 x fp128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load4 = load <vscale x 2 x fp128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: store <vscale x 1 x i128> %load1, ptr %ptrs
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %load1 = load <vscale x 1 x i128>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %load2 = load <vscale x 2 x i128>, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %load3 = load <vscale x 1 x fp128>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %load4 = load <vscale x 2 x fp128>, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: store <vscale x 1 x i128> %load1, ptr %ptrs, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%load1 = load <vscale x 1 x i128>, ptr undef
%load2 = load <vscale x 2 x i128>, ptr undef
%load3 = load <vscale x 1 x fp128>, ptr undef
@@ -19,8 +22,10 @@ define void @load_store(ptr %ptrs) {
define void @masked_load_store(ptr %ptrs, ptr %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) {
; CHECK-LABEL: 'masked_load_store'
-; CHECK-NEXT: Invalid cost for instruction: %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128.p0(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
-; CHECK-NEXT: Invalid cost for instruction: call void @llvm.masked.store.nxv1i128.p0(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128.p0(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.store.nxv1i128.p0(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
call void @llvm.masked.store.nxv1i128(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
ret void
@@ -28,13 +33,25 @@ define void @masked_load_store(ptr %ptrs, ptr %val, <vscale x 1 x i1> %mask, <vs
define void @masked_gather_scatter(<vscale x 1 x ptr> %ptrs, <vscale x 1 x ptr> %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) {
; CHECK-LABEL: 'masked_gather_scatter'
-; CHECK-NEXT: Invalid cost for instruction: %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128.nxv1p0(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
-; CHECK-NEXT: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i128.nxv1p0(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128.nxv1p0(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i128.nxv1p0(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
call void @llvm.masked.scatter.nxv1i128(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
ret void
}
+define <2 x i128> @masked_gather_v1i128(<2 x ptr> %ld, <2 x i1> %masks, <2 x i128> %passthru) vscale_range(2,2) {
+; CHECK-LABEL: 'masked_gather_v1i128'
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
+;
+ %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+ ret <2 x i128> %res
+}
+
+declare <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i128>)
declare <vscale x 1 x i128> @llvm.masked.load.nxv1i128(ptr, i32, <vscale x 1 x i1>, <vscale x 1 x i128>)
declare <vscale x 1 x i128> @llvm.masked.gather.nxv1i128(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i128>)
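The i128 and fp128 element types above fail the element-type legality check. Roughly, and simplified from the in-tree AArch64 predicate (FP16/BF16 feature checks omitted), the test looks like this sketch:

// Simplified sketch: SVE gathers/scatters handle at most 64-bit elements,
// so 128-bit elements (i128, fp128) are rejected and the cost query
// returns InstructionCost::getInvalid().
static bool isElementTypeLegalForScalableVectorSketch(Type *Ty) {
  return Ty->isFloatTy() || Ty->isDoubleTy() || // f32, f64
         Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
         Ty->isIntegerTy(32) || Ty->isIntegerTy(64);
}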
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll
new file mode 100644
index 000000000000000..3f02f974e59e67f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -mcpu=neoverse-512tvb < %s | FileCheck %s
+
+define void @gather_load_fp128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_fp128(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT: [[LOAD0:%.*]] = load fp128, ptr [[ARG]], align 1
+; CHECK-NEXT: [[LOAD1:%.*]] = load fp128, ptr [[GEP]], align 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT: [[LOAD3:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT: [[FCMP0:%.*]] = fcmp oeq fp128 [[LOAD0]], 0xL00000000000000000000000000000000
+; CHECK-NEXT: [[FCMP1:%.*]] = fcmp oeq fp128 [[LOAD1]], 0xL00000000000000000000000000000000
+; CHECK-NEXT: [[FCMP2:%.*]] = fcmp oeq fp128 [[LOAD2]], 0xL00000000000000000000000000000000
+; CHECK-NEXT: [[FCMP3:%.*]] = fcmp oeq fp128 [[LOAD3]], 0xL00000000000000000000000000000000
+; CHECK-NEXT: ret void
+;
+ %gep = getelementptr i8, ptr %arg, i64 16
+ %load0 = load fp128, ptr %arg, align 1
+ %load1 = load fp128, ptr %gep, align 1
+ %load2 = load fp128, ptr null, align 1
+ %load3 = load fp128, ptr null, align 1
+ %fcmp0 = fcmp oeq fp128 %load0, 0xL0
+ %fcmp1 = fcmp oeq fp128 %load1, 0xL0
+ %fcmp2 = fcmp oeq fp128 %load2, 0xL0
+ %fcmp3 = fcmp oeq fp128 %load3, 0xL0
+ ret void
+}
+
+define void @gather_load_i128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_i128(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT: [[LOAD0:%.*]] = load i128, ptr [[ARG]], align 1
+; CHECK-NEXT: [[LOAD1:%.*]] = load i128, ptr [[GEP]], align 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load i128, ptr null, align 1
+; CHECK-NEXT: [[LOAD3:%.*]] = load i128, ptr null, align 1
+; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i128 [[LOAD0]], 0
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i128 [[LOAD1]], 0
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i128 [[LOAD2]], 0
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i128 [[LOAD3]], 0
+; CHECK-NEXT: ret void
+;
+ %gep = getelementptr i8, ptr %arg, i64 16
+ %load0 = load i128, ptr %arg, align 1
+ %load1 = load i128, ptr %gep, align 1
+ %load2 = load i128, ptr null, align 1
+ %load3 = load i128, ptr null, align 1
+ %cmp0 = icmp eq i128 %load0, 0
+ %cmp1 = icmp eq i128 %load1, 0
+ %cmp2 = icmp eq i128 %load2, 0
+ %cmp3 = icmp eq i128 %load3, 0
+ ret void
+}
+
+attributes #0 = { vscale_range(2,2) }