[llvm] c75b251 - [GVN] Load-store forwarding of scalable store to fixed load. (#124748)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 30 02:49:45 PST 2025


Author: Lou
Date: 2025-01-30T11:49:42+01:00
New Revision: c75b251103a6acd785e6f43f640d7a08bd259c41

URL: https://github.com/llvm/llvm-project/commit/c75b251103a6acd785e6f43f640d7a08bd259c41
DIFF: https://github.com/llvm/llvm-project/commit/c75b251103a6acd785e6f43f640d7a08bd259c41.diff

LOG: [GVN] Load-store forwarding of scalable store to fixed load. (#124748)

When storing a scalable vector and later loading a fixed-size vector from the
same address, where the scalable vector is known to be at least as large based
on the function's vscale_range attribute, perform store-to-load forwarding
through temporary @llvm.vector.extract calls. InstCombine then folds the
insert/extract pair away.
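
For illustration, a minimal IR sketch of the transformation, adapted from the
tests added below (names are illustrative, not the exact GVN output):

```llvm
; Before GVN: a scalable store feeds a fixed-width load from the same alloca.
; With vscale_range(4,4), <vscale x 4 x float> is known to hold exactly 16
; floats, so the store covers the whole <16 x float> load.
define <vscale x 4 x float> @example(<vscale x 4 x float> %v) vscale_range(4,4) {
  %buf = alloca { <16 x float> }
  store <vscale x 4 x float> %v, ptr %buf
  %fixed = load <16 x float>, ptr %buf
  %widened = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %fixed, i64 0)
  ret <vscale x 4 x float> %widened
}

; After GVN the load is forwarded from the store through a temporary
;   %fixed = call <16 x float> @llvm.vector.extract.v16f32.nxv4f32(<vscale x 4 x float> %v, i64 0)
; and the extract/insert pair then folds away, leaving effectively
;   ret <vscale x 4 x float> %v
```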

The use case is shown in https://godbolt.org/z/KT3sMrMbd: when the
"arm_sve_vector_bits" attribute is used, clang generates IR that matches
this pattern:

```c
typedef svfloat32_t svfloat32_fixed_t
__attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t
add(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t
foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/Utils/VNCoercion.h
    llvm/lib/Transforms/Scalar/GVN.cpp
    llvm/lib/Transforms/Utils/VNCoercion.cpp
    llvm/test/Transforms/GVN/vscale.ll
    llvm/test/Transforms/NewGVN/vscale.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Utils/VNCoercion.h b/llvm/include/llvm/Transforms/Utils/VNCoercion.h
index f1ea94bf60fcc6..ed4dbad50ee853 100644
--- a/llvm/include/llvm/Transforms/Utils/VNCoercion.h
+++ b/llvm/include/llvm/Transforms/Utils/VNCoercion.h
@@ -23,6 +23,7 @@
 
 namespace llvm {
 class Constant;
+class Function;
 class StoreInst;
 class LoadInst;
 class MemIntrinsic;
@@ -35,7 +36,7 @@ namespace VNCoercion {
 /// Return true if CoerceAvailableValueToLoadType would succeed if it was
 /// called.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL);
+                                     Function *F);
 
 /// If we saw a store of a value to memory, and then a load from a must-aliased
 /// pointer of a different type, try to coerce the stored value to the loaded
@@ -44,7 +45,7 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &IRB, const DataLayout &DL);
+                                      IRBuilderBase &IRB, Function *F);
 
 /// This function determines whether a value for the pointer LoadPtr can be
 /// extracted from the store at DepSI.
@@ -75,7 +76,7 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
 /// It inserts instructions to do so at InsertPt, and returns the extracted
 /// value.
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                            Instruction *InsertPt, const DataLayout &DL);
+                       Instruction *InsertPt, Function *F);
 // This is the same as getValueForLoad, except it performs no insertion.
 // It only allows constant inputs.
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,

diff  --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 21eb7f741d7c82..3f306bb52c12a5 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1096,7 +1096,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
   if (isSimpleValue()) {
     Res = getSimpleValue();
     if (Res->getType() != LoadTy) {
-      Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+      Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, Load->getFunction());
 
       LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
                         << "  " << *getSimpleValue() << '\n'
@@ -1109,7 +1109,8 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
       Res = CoercedLoad;
       combineMetadataForCSE(CoercedLoad, Load, false);
     } else {
-      Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt, DL);
+      Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt,
+                            Load->getFunction());
       // We are adding a new user for this load, for which the original
       // metadata may not hold. Additionally, the new load may have a different
       // size and type, so their metadata cannot be combined in any
@@ -1291,7 +1292,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
 
         // If MD reported clobber, check it was nested.
         if (DepInfo.isClobber() &&
-            canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) {
+            canCoerceMustAliasedValueToLoad(DepLoad, LoadType,
+                                            DepLoad->getFunction())) {
           const auto ClobberOff = MD->getClobberOffset(DepLoad);
           // GVN has no deal with a negative offset.
           Offset = (ClobberOff == std::nullopt || *ClobberOff < 0)
@@ -1343,7 +1345,7 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // different types if we have to. If the stored value is convertable to
     // the loaded value, we can reuse it.
     if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), Load->getType(),
-                                         DL))
+                                         S->getFunction()))
       return std::nullopt;
 
     // Can't forward from non-atomic to atomic without violating memory model.
@@ -1357,7 +1359,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // If the types mismatch and we can't handle it, reject reuse of the load.
     // If the stored value is larger or equal to the loaded value, we can reuse
     // it.
-    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(), DL))
+    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(),
+                                         LD->getFunction()))
       return std::nullopt;
 
     // Can't forward from non-atomic to atomic without violating memory model.

diff  --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 7a61ab74166389..c1bce01239dcc9 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -15,30 +15,42 @@ static bool isFirstClassAggregateOrScalableType(Type *Ty) {
 
 /// Return true if coerceAvailableValueToLoadType will succeed.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL) {
+                                     Function *F) {
   Type *StoredTy = StoredVal->getType();
-
   if (StoredTy == LoadTy)
     return true;
 
+  const DataLayout &DL = F->getDataLayout();
+  TypeSize MinStoreSize = DL.getTypeSizeInBits(StoredTy);
+  TypeSize LoadSize = DL.getTypeSizeInBits(LoadTy);
   if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
-      DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
+      MinStoreSize == LoadSize)
     return true;
 
-  // If the loaded/stored value is a first class array/struct, or scalable type,
-  // don't try to transform them. We need to be able to bitcast to integer.
-  if (isFirstClassAggregateOrScalableType(LoadTy) ||
-      isFirstClassAggregateOrScalableType(StoredTy))
+  // If the loaded/stored value is a first class array/struct, don't try to
+  // transform them. We need to be able to bitcast to integer. For scalable
+  // vectors forwarded to fixed-sized vectors @llvm.vector.extract is used.
+  if (isa<ScalableVectorType>(StoredTy) && isa<FixedVectorType>(LoadTy)) {
+    if (StoredTy->getScalarType() != LoadTy->getScalarType())
+      return false;
+
+    // If it is known at compile-time that the VScale is larger than one,
+    // use that information to allow for wider loads.
+    const auto &Attrs = F->getAttributes().getFnAttrs();
+    unsigned MinVScale = Attrs.getVScaleRangeMin();
+    MinStoreSize =
+        TypeSize::getFixed(MinStoreSize.getKnownMinValue() * MinVScale);
+  } else if (isFirstClassAggregateOrScalableType(LoadTy) ||
+             isFirstClassAggregateOrScalableType(StoredTy)) {
     return false;
-
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedValue();
+  }
 
   // The store size must be byte-aligned to support future type casts.
-  if (llvm::alignTo(StoreSize, 8) != StoreSize)
+  if (llvm::alignTo(MinStoreSize, 8) != MinStoreSize)
     return false;
 
   // The store has to be at least as big as the load.
-  if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (!TypeSize::isKnownGE(MinStoreSize, LoadSize))
     return false;
 
   bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
@@ -57,11 +69,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
     return false;
   }
 
-
   // The implementation below uses inttoptr for vectors of unequal size; we
   // can't allow this for non integral pointers. We could teach it to extract
   // exact subvectors if desired.
-  if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (StoredNI && (StoredTy->isScalableTy() || MinStoreSize != LoadSize))
     return false;
 
   if (StoredTy->isTargetExtTy() || LoadTy->isTargetExtTy())
@@ -77,16 +88,24 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &Helper,
-                                      const DataLayout &DL) {
-  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+                                      IRBuilderBase &Helper, Function *F) {
+  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, F) &&
          "precondition violation - materialization can't fail");
+  const DataLayout &DL = F->getDataLayout();
   if (auto *C = dyn_cast<Constant>(StoredVal))
     StoredVal = ConstantFoldConstant(C, DL);
 
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
+  // If this is a scalable vector forwarded to a fixed vector load, create
+  // a @llvm.vector.extract instead of bitcasts.
+  if (isa<ScalableVectorType>(StoredVal->getType()) &&
+      isa<FixedVectorType>(LoadedTy)) {
+    return Helper.CreateIntrinsic(LoadedTy, Intrinsic::vector_extract,
+                                  {StoredVal, Helper.getInt64(0)});
+  }
+
   TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
   TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
 
@@ -220,7 +239,7 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
   if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DepSI->getFunction()))
     return -1;
 
   Value *StorePtr = DepSI->getPointerOperand();
@@ -235,11 +254,11 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
 /// the other load can feed into the second load.
 int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
                                   const DataLayout &DL) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+  // Cannot handle reading from store of first-class aggregate or scalable type.
+  if (isFirstClassAggregateOrScalableType(DepLI->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DepLI->getFunction()))
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
@@ -315,6 +334,16 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }
 
+  // For the case of a scalable vector being forwarded to a fixed-sized load,
+  // only equal element types are allowed and a @llvm.vector.extract will be
+  // used instead of bitcasts.
+  if (isa<ScalableVectorType>(SrcVal->getType()) &&
+      isa<FixedVectorType>(LoadTy)) {
+    assert(Offset == 0 &&
+           SrcVal->getType()->getScalarType() == LoadTy->getScalarType());
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -344,20 +373,24 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
 }
 
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                       Instruction *InsertPt, const DataLayout &DL) {
+                       Instruction *InsertPt, Function *F) {
+  const DataLayout &DL = F->getDataLayout();
 #ifndef NDEBUG
-  TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  TypeSize MinSrcValSize = DL.getTypeStoreSize(SrcVal->getType());
   TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
-  assert(SrcValSize.isScalable() == LoadSize.isScalable());
-  assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
+  if (MinSrcValSize.isScalable() && !LoadSize.isScalable())
+    MinSrcValSize =
+        TypeSize::getFixed(MinSrcValSize.getKnownMinValue() *
+                           F->getAttributes().getFnAttrs().getVScaleRangeMin());
+  assert((MinSrcValSize.isScalable() || Offset + LoadSize <= MinSrcValSize) &&
          "Expected Offset + LoadSize <= SrcValSize");
-  assert(
-      (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
-      "Expected scalable type sizes to match");
+  assert((!MinSrcValSize.isScalable() ||
+          (Offset == 0 && TypeSize::isKnownLE(LoadSize, MinSrcValSize))) &&
+         "Expected offset of zero and LoadSize <= SrcValSize");
 #endif
   IRBuilder<> Builder(InsertPt);
   SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
-  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
+  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, F);
 }
 
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
@@ -408,7 +441,8 @@ Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
       ++NumBytesSet;
     }
 
-    return coerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
+    return coerceAvailableValueToLoadType(Val, LoadTy, Builder,
+                                          InsertPt->getFunction());
   }
 
   // Otherwise, this is a memcpy/memmove from a constant global.

diff  --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
index 67cbfc2f05ef84..d7b07b9891c413 100644
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -641,3 +641,127 @@ entry:
   call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp)
   ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %15
 }
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+; Here, only the lower bound of vscale is known, but this is enough to allow forwarding to a load of 16 elements.
+define <vscale x 4 x float> @scalable_store_to_fixed_load_only_lower_bound(<vscale x 4 x float> %a) vscale_range(4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_only_lower_bound(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x float> }, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+entry:
+  %retval = alloca { <vscale x 4 x float> }
+  store <vscale x 4 x float> %a, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_with_offset(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_with_offset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[GEP]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <32 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %gep = getelementptr inbounds i8, ptr %ptr, i64 8
+  %1 = load <16 x float>, ptr %gep
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_unknown_vscale(<vscale x 4 x float> %.coerce) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_unknown_vscale(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_size_missmatch(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_size_missmatch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[RETVAL]], align 128
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <32 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <32 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x i32> @scalable_store_to_fixed_load_different_types(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_different_types(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[PTR]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <16 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <16 x i32>, ptr %ptr
+  %cast.scalable = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %cast.scalable
+}
+
+; This function does not have a fixed vscale, but the loaded vector is still known
+; to be smaller or equal in size compared to the stored vector.
+define <4 x float> @scalable_store_to_small_fixed_load(<vscale x 4 x float> %a) {
+; CHECK-LABEL: @scalable_store_to_small_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca <vscale x 4 x float>, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> [[A]], i64 0)
+; CHECK-NEXT:    ret <4 x float> [[TMP0]]
+;
+entry:
+  %ptr = alloca <vscale x 4 x float>
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <4 x float>, ptr %ptr
+  ret <4 x float> %1
+}

diff  --git a/llvm/test/Transforms/NewGVN/vscale.ll b/llvm/test/Transforms/NewGVN/vscale.ll
index 500d58baed1a2c..702117213f6eb1 100644
--- a/llvm/test/Transforms/NewGVN/vscale.ll
+++ b/llvm/test/Transforms/NewGVN/vscale.ll
@@ -646,3 +646,131 @@ entry:
   call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp)
   ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %15
 }
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+; Here, only the lower bound of vscale is known, but this is enough to allow forwarding to a load of 16 elements.
+define <vscale x 4 x float> @scalable_store_to_fixed_load_only_lower_bound(<vscale x 4 x float> %a) vscale_range(4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_only_lower_bound(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x float> }, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <vscale x 4 x float> }
+  store <vscale x 4 x float> %a, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_with_offset(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_with_offset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[GEP]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <32 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %gep = getelementptr inbounds i8, ptr %ptr, i64 8
+  %1 = load <16 x float>, ptr %gep
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_unknown_vscale(<vscale x 4 x float> %.coerce) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_unknown_vscale(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_size_missmatch(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_size_missmatch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[RETVAL]], align 128
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <32 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <32 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x i32> @scalable_store_to_fixed_load_different_types(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_different_types(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[PTR]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <16 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <16 x i32>, ptr %ptr
+  %cast.scalable = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %cast.scalable
+}
+
+; This function does not have a fixed vscale, but the loaded vector is still known
+; to be smaller or equal in size compared to the stored vector.
+define <4 x float> @scalable_store_to_small_fixed_load(<vscale x 4 x float> %a) {
+; CHECK-LABEL: @scalable_store_to_small_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca <vscale x 4 x float>, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[PTR]], align 16
+; CHECK-NEXT:    ret <4 x float> [[TMP0]]
+;
+entry:
+  %ptr = alloca <vscale x 4 x float>
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <4 x float>, ptr %ptr
+  ret <4 x float> %1
+}


        

