[llvm] [AArch64][TTI] Improve `LegalVF` when computing gather-loads cost (PR #69617)

Antonio Frighetto via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 27 07:26:28 PDT 2023


https://github.com/antoniofrighetto updated https://github.com/llvm/llvm-project/pull/69617

>From 7b5fa1d1702a2922ef5bc4568f9939cae1b211fd Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Thu, 19 Oct 2023 19:08:03 +0200
Subject: [PATCH] [AArch64][TTI] Fix `LegalVF` when gather loads are scalarized

After SLP has determined the cost of the loads that could not be
coalesced into `VectorizedLoads`, it computes the cost of a
gather-vectorized load. Return an invalid cost when the type of a
group of loads, a vector whose width depends on `VF`, may be
legalized into a scalar value.

Fixes: https://github.com/llvm/llvm-project/issues/68953.
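
For context, a minimal sketch of the reasoning behind the new guard
(simplified; `DataTy`, `VT`, and `LT` are the names used in the hunk
below). On AArch64, a 128-bit element cannot live in a vector lane, so
legalizing e.g. `<2 x i128>` may scalarize it, in which case the
reported legal type is no longer a vector:

  // Sketch only, not the verbatim change; see the diff below.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
  if (!LT.second.isVector() ||
      !isElementTypeLegalForScalableVector(VT->getElementType()))
    // The group of loads would be scalarized rather than emitted as a
    // gather, so no meaningful gather-load cost exists.
    return InstructionCost::getInvalid();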
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  7 +++
 .../CostModel/AArch64/sve-illegal-types.ll    | 35 ++++++++----
 .../SLPVectorizer/AArch64/gather-load-128.ll  | 54 +++++++++++++++++++
 3 files changed, 87 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5f2d09f0765aa38..2a9cba89d761edb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2996,6 +2996,9 @@ static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
+  if (!isLegalMaskedGatherScatter(DataTy))
+    return InstructionCost::getInvalid();
+
   if (useNeonVector(DataTy))
     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                          Alignment, CostKind, I);
@@ -3004,6 +3007,10 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
+  if (!LT.second.isVector() ||
+      !isElementTypeLegalForScalableVector(VT->getElementType()))
+    return InstructionCost::getInvalid();
+
   // The code-generator is currently not able to handle scalable vectors
   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
   // it. This change will be removed when code-generation for these types is
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll b/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
index 3e85760c3f53ec3..eb9b94c9be61e33 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
@@ -1,14 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve  < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @load_store(ptr %ptrs) {
 ; CHECK-LABEL: 'load_store'
-; CHECK-NEXT: Invalid cost for instruction: %load1 = load <vscale x 1 x i128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load2 = load <vscale x 2 x i128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load3 = load <vscale x 1 x fp128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load4 = load <vscale x 2 x fp128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: store <vscale x 1 x i128> %load1, ptr %ptrs
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %load1 = load <vscale x 1 x i128>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %load2 = load <vscale x 2 x i128>, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %load3 = load <vscale x 1 x fp128>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %load4 = load <vscale x 2 x fp128>, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: store <vscale x 1 x i128> %load1, ptr %ptrs, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %load1 = load <vscale x 1 x i128>, ptr undef
   %load2 = load <vscale x 2 x i128>, ptr undef
   %load3 = load <vscale x 1 x fp128>, ptr undef
@@ -19,8 +22,10 @@ define void @load_store(ptr %ptrs) {
 
 define void @masked_load_store(ptr %ptrs, ptr %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) {
 ; CHECK-LABEL: 'masked_load_store'
-; CHECK-NEXT: Invalid cost for instruction: %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128.p0(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
-; CHECK-NEXT: Invalid cost for instruction: call void @llvm.masked.store.nxv1i128.p0(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128.p0(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.store.nxv1i128.p0(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
   call void @llvm.masked.store.nxv1i128(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
   ret void
@@ -28,13 +33,25 @@ define void @masked_load_store(ptr %ptrs, ptr %val, <vscale x 1 x i1> %mask, <vs
 
 define void @masked_gather_scatter(<vscale x 1 x ptr> %ptrs, <vscale x 1 x ptr> %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) {
 ; CHECK-LABEL: 'masked_gather_scatter'
-; CHECK-NEXT: Invalid cost for instruction: %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128.nxv1p0(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
-; CHECK-NEXT: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i128.nxv1p0(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128.nxv1p0(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i128.nxv1p0(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
   call void @llvm.masked.scatter.nxv1i128(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
   ret void
 }
 
+define <2 x i128> @masked_gather_v2i128(<2 x ptr> %ld, <2 x i1> %masks, <2 x i128> %passthru) vscale_range(2,2) {
+; CHECK-LABEL: 'masked_gather_v2i128'
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
+;
+  %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+  ret <2 x i128> %res
+}
+
+declare <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i128>)
 declare <vscale x 1 x i128> @llvm.masked.load.nxv1i128(ptr, i32, <vscale x 1 x i1>, <vscale x 1 x i128>)
 declare <vscale x 1 x i128> @llvm.masked.gather.nxv1i128(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i128>)
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll
new file mode 100644
index 000000000000000..3f02f974e59e67f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -mcpu=neoverse-512tvb < %s | FileCheck %s
+
+define void @gather_load_fp128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_fp128(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT:    [[LOAD0:%.*]] = load fp128, ptr [[ARG]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load fp128, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT:    [[FCMP0:%.*]] = fcmp oeq fp128 [[LOAD0]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP1:%.*]] = fcmp oeq fp128 [[LOAD1]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP2:%.*]] = fcmp oeq fp128 [[LOAD2]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP3:%.*]] = fcmp oeq fp128 [[LOAD3]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    ret void
+;
+  %gep = getelementptr i8, ptr %arg, i64 16
+  %load0 = load fp128, ptr %arg, align 1
+  %load1 = load fp128, ptr %gep, align 1
+  %load2 = load fp128, ptr null, align 1
+  %load3 = load fp128, ptr null, align 1
+  %fcmp0 = fcmp oeq fp128 %load0, 0xL0
+  %fcmp1 = fcmp oeq fp128 %load1, 0xL0
+  %fcmp2 = fcmp oeq fp128 %load2, 0xL0
+  %fcmp3 = fcmp oeq fp128 %load3, 0xL0
+  ret void
+}
+
+define void @gather_load_i128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_i128(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i128, ptr [[ARG]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i128, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i128, ptr null, align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i128, ptr null, align 1
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i128 [[LOAD0]], 0
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i128 [[LOAD1]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i128 [[LOAD2]], 0
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i128 [[LOAD3]], 0
+; CHECK-NEXT:    ret void
+;
+  %gep = getelementptr i8, ptr %arg, i64 16
+  %load0 = load i128, ptr %arg, align 1
+  %load1 = load i128, ptr %gep, align 1
+  %load2 = load i128, ptr null, align 1
+  %load3 = load i128, ptr null, align 1
+  %cmp0 = icmp eq i128 %load0, 0
+  %cmp1 = icmp eq i128 %load1, 0
+  %cmp2 = icmp eq i128 %load2, 0
+  %cmp3 = icmp eq i128 %load3, 0
+  ret void
+}
+
+attributes #0 = { vscale_range(2,2) }
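
For reference, the new SLP test can be exercised standalone with the
same invocation as its RUN line:

  opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu \
      -mcpu=neoverse-512tvb gather-load-128.ll

With the fix applied, the i128/fp128 loads above stay scalar, as the
CHECK lines show.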



More information about the llvm-commits mailing list