[llvm] [AArch64][TTI] Improve `LegalVF` when computing gather-loads cost (PR #69617)

Thu Oct 19 10:16:00 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Antonio Frighetto (antoniofrighetto)

<details>
<summary>Changes</summary>

After determining the cost of the loads that could not be coalesced into `VectorizedLoads`, the cost of a gather-vectorized load is computed. This change takes into account the case where the type of a group of loads, a vector whose size depends on `VF`, may be legalized into a scalar type.

The current number of loads is held in `LT.first`.

Fixes: https://github.com/llvm/llvm-project/issues/68953.

---
Full diff: https://github.com/llvm/llvm-project/pull/69617.diff


2 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+8-1) 
- (added) llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-fp128.ll (+37) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d8a0e68d7123759..40ed3e6d327d882 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2989,7 +2989,14 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
       ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
-  ElementCount LegalVF = LT.second.getVectorElementCount();
+  ElementCount LegalVF;
+  if (LT.second.isVector()) {
+    LegalVF = LT.second.getVectorElementCount();
+  } else {
+    // If the legalized type is a simple type, treat it as a 1-element vector.
+    LegalVF = ElementCount::getFixed(1);
+  }
+
   InstructionCost MemOpCost =
       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
                       {TTI::OK_AnyValue, TTI::OP_None}, I);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-fp128.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-fp128.ll
new file mode 100644
index 000000000000000..84de45bde06a62f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-fp128.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -mcpu=neoverse-512tvb -pass-remarks-output=%t.yaml < %s | FileCheck %s
+; RUN: cat %t.yaml | FileCheck -check-prefix=REMARK %s
+
+; REMARK-LABEL: Function:        gather_load_fp128
+; REMARK:       Args:
+; REMARK-NEXT:    - String:          'List vectorization was possible but not beneficial with cost '
+; REMARK-NEXT:    - Cost:            '0'
+; REMARK-NEXT:    - String:          ' >= '
+; REMARK-NEXT:    - Treshold:        '0'
+
+define void @gather_load_fp128(ptr %arg) #0 {
+; CHECK-LABEL: @gather_load_fp128(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16
+; CHECK-NEXT:    [[LOAD0:%.*]] = load fp128, ptr [[ARG]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load fp128, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load fp128, ptr null, align 1
+; CHECK-NEXT:    [[FCMP0:%.*]] = fcmp oeq fp128 [[LOAD0]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP1:%.*]] = fcmp oeq fp128 [[LOAD1]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP2:%.*]] = fcmp oeq fp128 [[LOAD2]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    [[FCMP3:%.*]] = fcmp oeq fp128 [[LOAD3]], 0xL00000000000000000000000000000000
+; CHECK-NEXT:    ret void
+;
+  %gep = getelementptr i8, ptr %arg, i64 16
+  %load0 = load fp128, ptr %arg, align 1
+  %load1 = load fp128, ptr %gep, align 1
+  %load2 = load fp128, ptr null, align 1
+  %load3 = load fp128, ptr null, align 1
+  %fcmp0 = fcmp oeq fp128 %load0, 0xL0
+  %fcmp1 = fcmp oeq fp128 %load1, 0xL0
+  %fcmp2 = fcmp oeq fp128 %load2, 0xL0
+  %fcmp3 = fcmp oeq fp128 %load3, 0xL0
+  ret void
+}
+
+attributes #0 = { vscale_range(2,2) }

``````````

</details>


https://github.com/llvm/llvm-project/pull/69617