[llvm] [GVN] Handle scalable vectors with the same size in VNCoercion (PR #123984)

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 23 09:59:45 PST 2025


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/123984

From 7171d8500bf8ebd1365cb3d9d3e4769da01fad1f Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 23 Jan 2025 17:58:11 +0000
Subject: [PATCH] [GVN] Handle scalable vectors with the same size in
 VNCoercion

This allows store-to-load forwarding even if the types do not match (nxv4i32
vs nxv2i64, for example). Scalable types are allowed in
canCoerceMustAliasedValueToLoad so long as the size (minelts * scalarsize) is
the same, and some follow-on code is adjusted to make sure it handles scalable
sizes correctly. Methods like analyzeLoadFromClobberingWrite and
analyzeLoadFromClobberingStore still do nothing for scalable vectors, as
offsets and mismatching types are not supported.
---
 llvm/lib/Transforms/Utils/VNCoercion.cpp | 30 ++++++++++++++++++------
 llvm/test/Transforms/GVN/vscale.ll       | 24 +++++++++----------
 2 files changed, 35 insertions(+), 19 deletions(-)
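
Not part of the patch, just an illustrative sketch (function name made up for
the example): nxv4i32 and nxv2i64 have the same scalable size, so with this
change GVN can replace the load below with a bitcast of the stored value
instead of leaving the load in place.

define <vscale x 2 x i64> @forward_same_size_example(ptr %p, <vscale x 4 x i32> %x) {
  ; store the value as nxv4i32, then read the same bytes back as nxv2i64
  store <vscale x 4 x i32> %x, ptr %p
  ; with this patch GVN rewrites this load to:
  ;   %load = bitcast <vscale x 4 x i32> %x to <vscale x 2 x i64>
  %load = load <vscale x 2 x i64>, ptr %p
  ret <vscale x 2 x i64> %load
}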

diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 1e0ae280516410..7a61ab74166389 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -21,6 +21,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
   if (StoredTy == LoadTy)
     return true;
 
+  if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
+      DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
+    return true;
+
   // If the loaded/stored value is a first class array/struct, or scalable type,
   // don't try to transform them. We need to be able to bitcast to integer.
   if (isFirstClassAggregateOrScalableType(LoadTy) ||
@@ -83,8 +87,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
-  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedValue();
-  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedValue();
+  TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+  TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
 
   // If the store and reload are the same size, we can always reuse it.
   if (StoredValSize == LoadedValSize) {
@@ -118,7 +122,8 @@ Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
   // If the loaded value is smaller than the available value, then we can
   // extract out a piece from it.  If the available value is too small, then we
   // can't do anything.
-  assert(StoredValSize >= LoadedValSize &&
+  assert(!StoredValSize.isScalable() &&
+         TypeSize::isKnownGE(StoredValSize, LoadedValSize) &&
          "canCoerceMustAliasedValueToLoad fail");
 
   // Convert source pointers to integers, which can be manipulated.
@@ -303,6 +308,13 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }
 
+  // Return scalable values directly to avoid needing to bitcast to integer
+  // types, as we do not support non-zero Offsets.
+  if (isa<ScalableVectorType>(LoadTy)) {
+    assert(Offset == 0 && "Expected a zero offset for scalable types");
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -333,11 +345,15 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
 
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
                        Instruction *InsertPt, const DataLayout &DL) {
-
 #ifndef NDEBUG
-  unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()).getFixedValue();
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedValue();
-  assert(Offset + LoadSize <= SrcValSize);
+  TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
+  assert(SrcValSize.isScalable() == LoadSize.isScalable());
+  assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
+         "Expected Offset + LoadSize <= SrcValSize");
+  assert(
+      (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
+      "Expected scalable type sizes to match");
 #endif
   IRBuilder<> Builder(InsertPt);
   SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
index f6e0f8c1a64944..67cbfc2f05ef84 100644
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -393,7 +393,7 @@ if.else:
 define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
 ; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load(
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    ret <vscale x 16 x i8> [[LOAD]]
 ;
   store <vscale x 4 x i32> %x, ptr %p
@@ -404,7 +404,7 @@ define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_load(ptr %p, <vscale x
 define <vscale x 4 x float> @load_v4f32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
 ; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load(
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x float>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 4 x float>
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
 ;
   store <vscale x 4 x i32> %x, ptr %p
@@ -415,7 +415,7 @@ define <vscale x 4 x float> @load_v4f32_store_v4i32_forward_load(ptr %p, <vscale
 define <vscale x 4 x float> @load_v4f32_store_v16i8_forward_load(ptr %p, <vscale x 16 x i8> %x)  {
 ; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load(
 ; CHECK-NEXT:    store <vscale x 16 x i8> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x float>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 16 x i8> [[X]] to <vscale x 4 x float>
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
 ;
   store <vscale x 16 x i8> %x, ptr %p
@@ -426,7 +426,7 @@ define <vscale x 4 x float> @load_v4f32_store_v16i8_forward_load(ptr %p, <vscale
 define <vscale x 4 x i32> @load_v4i32_store_v4f32_forward_load(ptr %p, <vscale x 4 x float> %x)  {
 ; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load(
 ; CHECK-NEXT:    store <vscale x 4 x float> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = bitcast <vscale x 4 x float> [[X]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[LOAD]]
 ;
   store <vscale x 4 x float> %x, ptr %p
@@ -496,7 +496,8 @@ define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, <
 define <vscale x 2 x ptr> @load_v2p0_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x)  {
 ; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load(
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 4 x i32> [[X]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[LOAD:%.*]] = inttoptr <vscale x 2 x i64> [[TMP1]] to <vscale x 2 x ptr>
 ; CHECK-NEXT:    ret <vscale x 2 x ptr> [[LOAD]]
 ;
   store <vscale x 4 x i32> %x, ptr %p
@@ -507,7 +508,7 @@ define <vscale x 2 x ptr> @load_v2p0_store_v4i32_forward_load(ptr %p, <vscale x
 define <vscale x 2 x i64> @load_v2i64_store_v2p0_forward_load(ptr %p, <vscale x 2 x ptr> %x)  {
 ; CHECK-LABEL: @load_v2i64_store_v2p0_forward_load(
 ; CHECK-NEXT:    store <vscale x 2 x ptr> [[X:%.*]], ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[P]], align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = ptrtoint <vscale x 2 x ptr> [[X]] to <vscale x 2 x i64>
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[LOAD]]
 ;
   store <vscale x 2 x ptr> %x, ptr %p
@@ -540,8 +541,7 @@ define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, <vscale x 4 x i3
 define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_constant(ptr %p)  {
 ; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant(
 ; CHECK-NEXT:    store <vscale x 4 x i32> splat (i32 4), ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[LOAD]]
+; CHECK-NEXT:    ret <vscale x 16 x i8> bitcast (<vscale x 4 x i32> splat (i32 4) to <vscale x 16 x i8>)
 ;
   store <vscale x 4 x i32> splat (i32 4), ptr %p
   %load = load <vscale x 16 x i8>, ptr %p
@@ -590,13 +590,13 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
 ; CHECK-NEXT:    [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[A_ELT6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 3
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16
-; CHECK-NEXT:    [[DOTUNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP]], align 16
+; CHECK-NEXT:    [[DOTUNPACK:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[DOTUNPACK]], 0
-; CHECK-NEXT:    [[DOTUNPACK8:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK1]], align 16
+; CHECK-NEXT:    [[DOTUNPACK8:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT2]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[DOTUNPACK8]], 1
-; CHECK-NEXT:    [[DOTUNPACK10:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK3]], align 16
+; CHECK-NEXT:    [[DOTUNPACK10:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT4]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]], <vscale x 16 x i8> [[DOTUNPACK10]], 2
-; CHECK-NEXT:    [[DOTUNPACK12:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK5]], align 16
+; CHECK-NEXT:    [[DOTUNPACK12:%.*]] = bitcast <vscale x 4 x i32> [[A_ELT6]] to <vscale x 16 x i8>
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP12]], <vscale x 16 x i8> [[DOTUNPACK12]], 3
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]])
 ; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP15]]
