[llvm] [InstCombine] Simple store-to-load forwarding between fixed/scalable vectors (PR #124577)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 27 08:02:39 PST 2025
https://github.com/iamlouk created https://github.com/llvm/llvm-project/pull/124577
When storing a scalable vector and vscale is a compile-time-known
constant, do basic store-to-load forwarding through @llvm.vector.insert
calls, even if the loaded vector is fixed-size instead of scalable.
The @llvm.vector.insert is matched instead of the load itself because it is
invalid to create a temporary insert of a scalable vector (the stored
value) into a fixed-size vector (the load type).
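For illustration, here is a minimal sketch of the matched pattern and the resulting fold, distilled from the test added by this patch (the function and value names are illustrative):
```llvm
; With vscale_range(4,4), <vscale x 4 x float> and <16 x float> are both
; 512 bits, so the stored value can be forwarded to the vector.insert.
define <vscale x 4 x float> @example(<vscale x 4 x float> %v) vscale_range(4,4) {
  %buf = alloca <16 x float>
  store <vscale x 4 x float> %v, ptr %buf
  %fixed = load <16 x float>, ptr %buf
  ; This patch matches the vector.insert below (not the load) and folds the
  ; whole sequence so that the function simply returns %v.
  %cast = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %fixed, i64 0)
  ret <vscale x 4 x float> %cast
}

declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64)
```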
The use case is shown in this [godbolt
link](https://godbolt.org/z/KT3sMrMbd), which shows that clang generates
IR matching this pattern when the "arm_sve_vector_bits" attribute is
used:
```c
#include <arm_sve.h>

typedef svfloat32_t svfloat32_fixed_t
    __attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t
add(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t
foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```
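For reference, the two pieces of the generated IR that make the fold possible are excerpted below from the test file added by this patch: the fixed-length struct member produced by `arm_sve_vector_bits(512)`, and the `vscale_range(4,4)` function attribute, which together make the stored and loaded sizes provably equal (16 floats, 512 bits).
```llvm
; Excerpt from the test added below: the struct member is a fixed 512-bit
; vector, and the function attribute pins vscale to 4, so a
; <vscale x 4 x float> store covers exactly the <16 x float> load.
%struct.svfloat32_wrapped_t = type { <16 x float> }

attributes #0 = { vscale_range(4,4) }
```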
>From 091c45d903128e20bd087498f5518f198d2a656e Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Mon, 27 Jan 2025 18:53:29 +0100
Subject: [PATCH 1/2] [InstCombine] tests for simple store-to-load forwarding
between fixed/scalable vectors
---
.../InstCombine/store-load-vector-insert.ll | 70 +++++++++++++++++++
1 file changed, 70 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
diff --git a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
new file mode 100644
index 00000000000000..1457b8c4391e2a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+%struct.svfloat32_wrapped_t = type { <16 x float> }
+
+define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+ %retval = alloca %struct.svfloat32_wrapped_t
+ %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+ store <vscale x 4 x float> %0, ptr %retval
+ %1 = load <16 x float>, ptr %retval
+ %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+ ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @vscale_not_fixed(<vscale x 4 x float> %.coerce) #1 {
+; CHECK-LABEL: define <vscale x 4 x float> @vscale_not_fixed(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+ %retval = alloca %struct.svfloat32_wrapped_t
+ %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+ store <vscale x 4 x float> %0, ptr %retval
+ %1 = load <16 x float>, ptr %retval
+ %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+ ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @sizes_do_not_match(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @sizes_do_not_match(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[RETVAL]], align 32
+; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> [[TMP1]], i64 0)
+; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+ %retval = alloca %struct.svfloat32_wrapped_t
+ %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+ store <vscale x 4 x float> %0, ptr %retval
+ %1 = load <8 x float>, ptr %retval
+ %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> %1, i64 0)
+ ret <vscale x 4 x float> %cast.scalable
+}
+
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64 immarg)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float>, <8 x float>, i64 immarg)
+
+attributes #0 = { vscale_range(4,4) }
+attributes #1 = { vscale_range(1,16) }
>From 28e62bf79e12a57d32e9523ff17326f698552bd0 Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Mon, 27 Jan 2025 18:55:36 +0100
Subject: [PATCH 2/2] [InstCombine] Simple store-to-load forwarding between
fixed/scalable vectors
When storing a scalable vector and vscale is a compile-time-known
constant, do basic store-to-load forwarding through @llvm.vector.insert
calls, even if the loaded vector is fixed-size instead of scalable.
The @llvm.vector.insert is matched instead of the load itself because it is
invalid to create a temporary insert of a scalable vector (the stored
value) into a fixed-size vector (the load type).
The use case is shown in this [godbolt
link](https://godbolt.org/z/KT3sMrMbd), which shows that clang generates
IR matching this pattern when the "arm_sve_vector_bits" attribute is
used:
```
#include <arm_sve.h>

typedef svfloat32_t svfloat32_fixed_t
    __attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t
add(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t
foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```
---
llvm/include/llvm/Analysis/Loads.h | 4 +++
llvm/lib/Analysis/Loads.cpp | 33 +++++++++++++++----
.../InstCombine/InstCombineCalls.cpp | 33 ++++++++++++++-----
.../InstCombine/store-load-vector-insert.ll | 6 +---
4 files changed, 57 insertions(+), 19 deletions(-)
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b0..0cadbc5fede9b8 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -154,8 +154,12 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
/// FindAvailableLoadedValue() for the case where we are not interested in
/// finding the closest clobbering instruction if no available load is found.
/// This overload cannot be used to scan across multiple blocks.
+/// If \p IsVectorKindChange is not nullptr, it is an out parameter that is
+/// set to true if a value was found but it is a scalable vector instead of
+/// the requested fixed-size one (or the other way around).
Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
bool *IsLoadCSE,
+ bool *IsVectorKindChange = nullptr,
unsigned MaxInstsToScan = DefMaxInstsToScan);
/// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 691d7e4a3edcff..e4bd59fbf2d300 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -538,7 +538,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
Type *AccessTy, bool AtLeastAtomic,
- const DataLayout &DL, bool *IsLoadCSE) {
+ const DataLayout &DL, bool *IsLoadCSE,
+ bool *IsVectorKindChange) {
// If this is a load of Ptr, the loaded value is available.
// (This is true even if the load is volatile or atomic, although
// those cases are unlikely.)
@@ -584,6 +585,25 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
if (TypeSize::isKnownLE(LoadSize, StoreSize))
if (auto *C = dyn_cast<Constant>(Val))
return ConstantFoldLoadFromConst(C, AccessTy, DL);
+
+ if (IsVectorKindChange && Val->getType()->isVectorTy() &&
+ AccessTy->isVectorTy()) {
+ auto Attrs = Inst->getFunction()->getAttributes().getFnAttrs();
+ unsigned VScale = Attrs.getVScaleRangeMin();
+ if (Attrs.getVScaleRangeMax() != VScale)
+ return nullptr;
+
+ unsigned FixedStoreSize =
+ (StoreSize.isFixed() ? StoreSize : StoreSize * VScale)
+ .getKnownMinValue();
+ unsigned FixedLoadSize =
+ (LoadSize.isFixed() ? LoadSize : LoadSize * VScale)
+ .getKnownMinValue();
+ if (FixedStoreSize == FixedLoadSize) {
+ *IsVectorKindChange = true;
+ return Val;
+ }
+ }
}
if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -655,8 +675,8 @@ Value *llvm::findAvailablePtrLoadStore(
--ScanFrom;
- if (Value *Available = getAvailableLoadStore(Inst, StrippedPtr, AccessTy,
- AtLeastAtomic, DL, IsLoadCSE))
+ if (Value *Available = getAvailableLoadStore(
+ Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL, IsLoadCSE, nullptr))
return Available;
// Try to get the store size for the type.
@@ -711,7 +731,7 @@ Value *llvm::findAvailablePtrLoadStore(
}
Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
- bool *IsLoadCSE,
+ bool *IsLoadCSE, bool *IsVectorKindChange,
unsigned MaxInstsToScan) {
const DataLayout &DL = Load->getDataLayout();
Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -734,8 +754,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
if (MaxInstsToScan-- == 0)
return nullptr;
- Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
- AtLeastAtomic, DL, IsLoadCSE);
+ Available =
+ getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL,
+ IsLoadCSE, IsVectorKindChange);
if (Available)
break;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f748f78524e0d7..f463fe3e7d504b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3389,17 +3389,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Value *Vec = II->getArgOperand(0);
Value *SubVec = II->getArgOperand(1);
Value *Idx = II->getArgOperand(2);
- auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
- auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
- auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+ auto *DstTy = cast<VectorType>(II->getType());
+ auto *VecTy = cast<VectorType>(Vec->getType());
+ auto *SubVecTy = cast<VectorType>(SubVec->getType());
+ unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+
+ // Try store-to-load forwarding where the stored value has the same
+ // type as this intrinsic, and the loaded value is the inserted
+ // vector. This has to be done here because a temporary insert of
+ // a scalable vector (the available value) into a fixed-size one
+ // (the second operand of this intrinsic) cannot be created.
+ if (auto *LI = dyn_cast<LoadInst>(SubVec);
+ LI && IdxN == 0 && DstTy->isScalableTy() && !SubVecTy->isScalableTy()) {
+ bool IsVectorKindChange = false;
+ BatchAAResults BatchAA(*AA);
+ if (Value *AvailVal = FindAvailableLoadedValue(LI, BatchAA, nullptr,
+ &IsVectorKindChange);
+ AvailVal && IsVectorKindChange && AvailVal->getType() == DstTy) {
+ return replaceInstUsesWith(CI, AvailVal);
+ }
+ }
// Only canonicalize if the destination vector, Vec, and SubVec are all
// fixed vectors.
- if (DstTy && VecTy && SubVecTy) {
- unsigned DstNumElts = DstTy->getNumElements();
- unsigned VecNumElts = VecTy->getNumElements();
- unsigned SubVecNumElts = SubVecTy->getNumElements();
- unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+ if (!DstTy->isScalableTy() && !VecTy->isScalableTy() &&
+ !SubVecTy->isScalableTy()) {
+ unsigned DstNumElts = DstTy->getElementCount().getFixedValue();
+ unsigned VecNumElts = VecTy->getElementCount().getFixedValue();
+ unsigned SubVecNumElts = SubVecTy->getElementCount().getFixedValue();
// An insert that entirely overwrites Vec with SubVec is a nop.
if (VecNumElts == SubVecNumElts)
diff --git a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
index 1457b8c4391e2a..73685fe8c37628 100644
--- a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
+++ b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -7,12 +7,8 @@ define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x f
; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
-; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
-; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
;
entry:
%retval = alloca %struct.svfloat32_wrapped_t