[llvm] [TTI][X86] getMemoryOpCost - reduced costs when loading uniform values due to value reuse (PR #118642)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 4 06:12:38 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
Similar to what we do for broadcast shuffles, when legalising load costs, if the value is known to be uniform, then we will only load a single vector and reuse this across the split legalised registers.
Fixes #<!-- -->111126
---
Full diff: https://github.com/llvm/llvm-project/pull/118642.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86TargetTransformInfo.cpp (+17-12)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll (+11-34)
``````````diff
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3b424bbb53e5b1..cae72c4210352e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5237,6 +5237,23 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
CurrOpSizeBytes != 1)
break; // Try smalled vector size.
+ // This isn't exactly right. We're using slow unaligned 32-byte accesses
+ // as a proxy for a double-pumped AVX memory interface such as on
+ // Sandybridge.
+ // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
+ // will be scalarized.
+ if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+ Cost += 2;
+ else if (CurrOpSizeBytes < 4)
+ Cost += 2;
+ else
+ Cost += 1;
+
+ // If we're loading a uniform value, then we don't need to split the load,
+ // loading just a single (widest) vector can be reused by all splits.
+ if (IsLoad && OpInfo.isUniform())
+ return Cost;
+
bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
// If we have fully processed the previous reg, we need to replenish it.
@@ -5265,18 +5282,6 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
!IsLoad, CostKind);
}
- // This isn't exactly right. We're using slow unaligned 32-byte accesses
- // as a proxy for a double-pumped AVX memory interface such as on
- // Sandybridge.
- // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
- // will be scalarized.
- if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
- Cost += 2;
- else if (CurrOpSizeBytes < 4)
- Cost += 2;
- else
- Cost += 1;
-
SubVecEltsLeft -= CurrNumEltPerOp;
NumEltRemaining -= CurrNumEltPerOp;
Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll
index 15c878daff26b7..0b5e279dea5ba6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll
@@ -1,42 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s
@arr = global [20 x i64] zeroinitializer, align 16
define void @PR111126() {
-; SSE-LABEL: @PR111126(
-; SSE-NEXT: store i64 1, ptr @arr, align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 16), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 24), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 40), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 48), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 56), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 72), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 80), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 88), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 104), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 112), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 120), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 136), align 8
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 144), align 16
-; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 152), align 8
-; SSE-NEXT: ret void
-;
-; AVX-LABEL: @PR111126(
-; AVX-NEXT: store <4 x i64> splat (i64 1), ptr @arr, align 16
-; AVX-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; AVX-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; AVX-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; AVX-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; AVX-NEXT: ret void
+; CHECK-LABEL: @PR111126(
+; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr @arr, align 16
+; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
+; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
+; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
+; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
+; CHECK-NEXT: ret void
;
store i64 1, ptr @arr, align 16
store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
``````````
</details>
https://github.com/llvm/llvm-project/pull/118642
More information about the llvm-commits
mailing list