[llvm] 775d0f3 - [GVN] Handle scalable vectors with the same size in VNCoercion (#123984)

via llvm-commits <llvm-commits at lists.llvm.org>
Thu Jan 23 10:43:54 PST 2025


Author: David Green
Date: 2025-01-23T18:43:50Z
New Revision: 775d0f36f74851172f84074d90cde29e181b3edd

URL: https://github.com/llvm/llvm-project/commit/775d0f36f74851172f84074d90cde29e181b3edd
DIFF: https://github.com/llvm/llvm-project/commit/775d0f36f74851172f84074d90cde29e181b3edd.diff

LOG: [GVN] Handle scalable vectors with the same size in VNCoercion (#123984)

This allows us to forward a store to a load even if the types do not match
(for example, nxv4i32 vs nxv2i64). Scalable types are allowed in
canCoerceMustAliasedValueToLoad so long as the size (minimum element count *
scalar size) is the same, and some follow-on code is adjusted to make sure it
handles scalable sizes correctly. Methods like analyzeLoadFromClobberingWrite
and analyzeLoadFromClobberingStore still do nothing for scalable vectors, as
non-zero offsets and mismatching types are not supported there.
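
As a rough sketch (a hypothetical example, not taken from the patch or its
tests; the function name is made up for illustration), the kind of forwarding
this enables looks like:

  define <vscale x 2 x i64> @forward_example(ptr %p, <vscale x 4 x i32> %x) {
    store <vscale x 4 x i32> %x, ptr %p         ; store an nxv4i32
    %load = load <vscale x 2 x i64>, ptr %p     ; reload as nxv2i64, same scalable size
    ret <vscale x 2 x i64> %load
  }

With this change GVN is expected to replace the load with
  %load = bitcast <vscale x 4 x i32> %x to <vscale x 2 x i64>
matching the bitcast-based forwarding shown in the vscale.ll test updates below.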

Added: 
    

Modified: 
    llvm/lib/Transforms/Utils/VNCoercion.cpp
    llvm/test/Transforms/GVN/vscale.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 1e0ae280516410..7a61ab74166389 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
   if (StoredTy == LoadTy)
     return true;
 
+  if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
+      DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
+    return true;
+
   // If the loaded/stored value is a first class array/struct, or scalable type,
   // don't try to transform them. We need to be able to bitcast to integer.
   if (isFirstClassAggregateOrScalableType(LoadTy) ||
@@ -83,8 +87,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
-  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedValue();
-  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedValue();
+  TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+  TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
 
   // If the store and reload are the same size, we can always reuse it.
   if (StoredValSize == LoadedValSize) {
@@ -118,7 +122,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
   // If the loaded value is smaller than the available value, then we can
   // extract out a piece from it.  If the available value is too small, then we
   // can't do anything.
-  assert(StoredValSize >= LoadedValSize &&
+  assert(!StoredValSize.isScalable() &&
+         TypeSize::isKnownGE(StoredValSize, LoadedValSize) &&
          "canCoerceMustAliasedValueToLoad fail");
 
   // Convert source pointers to integers, which can be manipulated.
@@ -303,6 +308,13 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }
 
+  // Return scalable values directly to avoid needing to bitcast to integer
+  // types, as we do not support non-zero Offsets.
+  if (isa<ScalableVectorType>(LoadTy)) {
+    assert(Offset == 0 && "Expected a zero offset for scalable types");
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -333,11 +345,15 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
 
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
                        Instruction *InsertPt, const DataLayout &DL) {
-
 #ifndef NDEBUG
-  unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()).getFixedValue();
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedValue();
-  assert(Offset + LoadSize <= SrcValSize);
+  TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
+  assert(SrcValSize.isScalable() == LoadSize.isScalable());
+  assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
+         "Expected Offset + LoadSize <= SrcValSize");
+  assert(
+      (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
+      "Expected scalable type sizes to match");
 #endif
   IRBuilder<> Builder(InsertPt);
   SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);

diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
index f6e0f8c1a64944..67cbfc2f05ef84 100644
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -393,7 +393,7 @@ if.else:
 define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
 ; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load(
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    ret <vscale x 16 x i8> [[LOAD]]
 ;
   store <vscale x 4 x i32> %x, ptr %p
@@ -404,7 +404,7 @@ define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_load(ptr %p, <vscale x
 define <vscale x 4 x float> @load_v4f32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
 ; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load(
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x float>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 4 x float>
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
 ;
   store <vscale x 4 x i32> %x, ptr %p
@@ -415,7 +415,7 @@ define <vscale x 4 x float> @load_v4f32_store_v4i32_forward_load(ptr %p, <vscale
 define <vscale x 4 x float> @load_v4f32_store_v16i8_forward_load(ptr %p, <vscale x 16 x i8> %x)  {
 ; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load(
 ; CHECK-NEXT:    store <vscale x 16 x i8> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x float>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 16 x i8> [[X]] to <vscale x 4 x float>
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
 ;
   store <vscale x 16 x i8> %x, ptr %p
@@ -426,7 +426,7 @@ define <vscale x 4 x float> @load_v4f32_store_v16i8_forward_load(ptr %p, <vscale
 define <vscale x 4 x i32> @load_v4i32_store_v4f32_forward_load(ptr %p, <vscale x 4 x float> %x)  {
 ; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load(
 ; CHECK-NEXT:    store <vscale x 4 x float> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x float> [[X]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[LOAD]]
 ;
   store <vscale x 4 x float> %x, ptr %p
@@ -496,7 +496,8 @@ define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, <
 define <vscale x 2 x ptr> @load_v2p0_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
 ; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load(
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[LOAD:%.*]] = inttoptr <vscale x 2 x i64> [[TMP1]] to <vscale x 2 x ptr>
 ; CHECK-NEXT:    ret <vscale x 2 x ptr> [[LOAD]]
 ;
   store <vscale x 4 x i32> %x, ptr %p
@@ -507,7 +508,7 @@ define <vscale x 2 x ptr> @load_v2p0_store_v4i32_forward_load(ptr %p, <vscale x
 define <vscale x 2 x i64> @load_v2i64_store_v2p0_forward_load(ptr %p, <vscale x 2 x ptr> %x)  {
 ; CHECK-LABEL: @load_v2i64_store_v2p0_forward_load(
 ; CHECK-NEXT:    store <vscale x 2 x ptr> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = ptrtoint <vscale x 2 x ptr> [[X]] to <vscale x 2 x i64>
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[LOAD]]
 ;
   store <vscale x 2 x ptr> %x, ptr %p
@@ -540,8 +541,7 @@ define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, <vscale x 4 x i3
 define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_constant(ptr %p)  {
 ; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant(
 ; CHECK-NEXT:    store <vscale x 4 x i32> splat (i32 4), ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[LOAD]]
+; CHECK-NEXT:    ret <vscale x 16 x i8> bitcast (<vscale x 4 x i32> splat (i32 4) to <vscale x 16 x i8>)
 ;
   store <vscale x 4 x i32> splat (i32 4), ptr %p
   %load = load <vscale x 16 x i8>, ptr %p
@@ -590,13 +590,13 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
 ; CHECK-NEXT:    [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[A_ELT6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 3
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16
-; CHECK-NEXT:    [[DOTUNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP]], align 16
+; CHECK-NEXT:    [[DOTUNPACK:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[DOTUNPACK]], 0
-; CHECK-NEXT:    [[DOTUNPACK8:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK1]], align 16
+; CHECK-NEXT:    [[DOTUNPACK8:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT2]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[DOTUNPACK8]], 1
-; CHECK-NEXT:    [[DOTUNPACK10:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK3]], align 16
+; CHECK-NEXT:    [[DOTUNPACK10:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT4]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]], <vscale x 16 x i8> [[DOTUNPACK10]], 2
-; CHECK-NEXT:    [[DOTUNPACK12:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK5]], align 16
+; CHECK-NEXT:    [[DOTUNPACK12:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT6]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP12]], <vscale x 16 x i8> [[DOTUNPACK12]], 3
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]])
 ; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP15]]


        

