[llvm] [SROA]: Only defer trying partial sized ptr or ptr vector types (PR #82279)

Mon Feb 19 13:42:57 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Jeffrey Byrnes (jrbyrnes)

<details>
<summary>Changes</summary>

https://github.com/llvm/llvm-project/pull/77678 introduced regressions of the following type: LoadStoreTys had the better candidate type, but it was not selected since considering LoadStoreTys was deferred until after full consideration of CandidateTys (those that are the same size as the partition).

See: https://godbolt.org/z/G5sxbf1sx

---
Full diff: https://github.com/llvm/llvm-project/pull/82279.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/Scalar/SROA.cpp (+45-28) 
- (modified) llvm/test/Transforms/SROA/vector-promotion.ll (+32) 


``````````diff

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 6c8785d52c4eab..b12d156e067edd 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2271,6 +2271,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
   // we have different element types.
   SmallVector<VectorType *, 4> CandidateTys;
   SetVector<Type *> LoadStoreTys;
+  SetVector<Type *> DeferredTys;
   Type *CommonEltTy = nullptr;
   VectorType *CommonVecPtrTy = nullptr;
   bool HaveVecPtrTy = false;
@@ -2305,6 +2306,35 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
     }
   };
 
+  auto createAndCheckVectorTypesForPromotion =
+      [&](SetVector<Type *> OtherTys,
+          SmallVector<VectorType *, 4> CandidateTysCopy) {
+        // Consider additional vector types where the element type size is a
+        // multiple of load/store element size.
+        for (Type *Ty : OtherTys) {
+          if (!VectorType::isValidElementType(Ty))
+            continue;
+          unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
+          // Make a copy of CandidateTys and iterate through it, because we
+          // might append to CandidateTys in the loop.
+          for (VectorType *&VTy : CandidateTysCopy) {
+            unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
+            unsigned ElementSize =
+                DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
+            if (TypeSize != VectorSize && TypeSize != ElementSize &&
+                VectorSize % TypeSize == 0) {
+              VectorType *NewVTy =
+                  VectorType::get(Ty, VectorSize / TypeSize, false);
+              CheckCandidateType(NewVTy);
+            }
+          }
+        }
+
+        return checkVectorTypesForPromotion(
+            P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
+            HaveCommonVecPtrTy, CommonVecPtrTy);
+      };
+
   // Put load and store types into a set for de-duplication.
   for (const Slice &S : P) {
     Type *Ty;
@@ -2314,44 +2344,31 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
       Ty = SI->getValueOperand()->getType();
     else
       continue;
+
+    auto CandTy =
+        isa<VectorType>(Ty) ? cast<VectorType>(Ty)->getElementType() : Ty;
+    if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
+                                  S.endOffset() != P.endOffset())) {
+      DeferredTys.insert(Ty);
+      continue;
+    }
+
     LoadStoreTys.insert(Ty);
     // Consider any loads or stores that are the exact size of the slice.
     if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
       CheckCandidateType(Ty);
   }
 
-  if (auto *VTy = checkVectorTypesForPromotion(
-          P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
-          HaveCommonVecPtrTy, CommonVecPtrTy))
+  SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
+  if (auto *VTy =
+          createAndCheckVectorTypesForPromotion(LoadStoreTys, CandidateTysCopy))
     return VTy;
 
-  // Consider additional vector types where the element type size is a
-  // multiple of load/store element size.
-  for (Type *Ty : LoadStoreTys) {
-    if (!VectorType::isValidElementType(Ty))
-      continue;
-    unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
-    // Make a copy of CandidateTys and iterate through it, because we might
-    // append to CandidateTys in the loop.
-    SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
-    CandidateTys.clear();
-    for (VectorType *&VTy : CandidateTysCopy) {
-      unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
-      unsigned ElementSize =
-          DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
-      if (TypeSize != VectorSize && TypeSize != ElementSize &&
-          VectorSize % TypeSize == 0) {
-        VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
-        CheckCandidateType(NewVTy);
-      }
-    }
-  }
-
-  return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy,
-                                      CommonEltTy, HaveVecPtrTy,
-                                      HaveCommonVecPtrTy, CommonVecPtrTy);
+  CandidateTys.clear();
+  return createAndCheckVectorTypesForPromotion(DeferredTys, CandidateTysCopy);
 }
 
+
 /// Test whether a slice of an alloca is valid for integer widening.
 ///
 /// This implements the necessary checking for the \c isIntegerWideningViable
diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll
index e48dd5bb392082..dc70520fc5ca70 100644
--- a/llvm/test/Transforms/SROA/vector-promotion.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion.ll
@@ -1392,6 +1392,38 @@ define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) {
   ret <4 x ptr> %sroaval
 }
 
+define <4 x i32> @validLoadStoreTy([2 x i64] %cond.coerce) {
+; CHECK-LABEL: @validLoadStoreTy(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0
+; CHECK-NEXT:    [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0
+; CHECK-NEXT:    [[COND_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE]], 1
+; CHECK-NEXT:    [[COND_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[COND_SROA_0_0_VEC_INSERT]], i64 [[COND_COERCE_FCA_1_EXTRACT]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[COND_SROA_0_8_VEC_INSERT]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+;
+; DEBUG-LABEL: @validLoadStoreTy(
+; DEBUG-NEXT:  entry:
+; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META553:![0-9]+]], metadata !DIExpression()), !dbg [[DBG557:![0-9]+]]
+; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META554:![0-9]+]], metadata !DIExpression()), !dbg [[DBG558:![0-9]+]]
+; DEBUG-NEXT:    [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0, !dbg [[DBG559:![0-9]+]]
+; DEBUG-NEXT:    [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0, !dbg [[DBG559]]
+; DEBUG-NEXT:    [[COND_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE]], 1, !dbg [[DBG559]]
+; DEBUG-NEXT:    [[COND_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[COND_SROA_0_0_VEC_INSERT]], i64 [[COND_COERCE_FCA_1_EXTRACT]], i32 1, !dbg [[DBG559]]
+; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META555:![0-9]+]], metadata !DIExpression()), !dbg [[DBG560:![0-9]+]]
+; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[COND_SROA_0_8_VEC_INSERT]] to <4 x i32>, !dbg [[DBG561:![0-9]+]]
+; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x i32> [[TMP0]], metadata [[META556:![0-9]+]], metadata !DIExpression()), !dbg [[DBG561]]
+; DEBUG-NEXT:    ret <4 x i32> [[TMP0]], !dbg [[DBG562:![0-9]+]]
+;
+entry:
+  %cond = alloca <4 x i32>, align 8
+  %coerce.dive2 = getelementptr inbounds <4 x i32>, ptr %cond, i32 0, i32 0
+  store [2 x i64] %cond.coerce, ptr %coerce.dive2, align 8
+  %m5 = getelementptr inbounds <4 x i32>, ptr %cond, i32 0, i32 0
+  %0 = load <4 x i32>, ptr %m5, align 8
+  ret <4 x i32> %0
+}
+
 declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
 declare void @llvm.lifetime.end.p0(i64, ptr)
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

``````````

</details>


https://github.com/llvm/llvm-project/pull/82279