[llvm] ad35d91 - [VectorCombine] Enable transform 'foldSingleElementStore' for scalable vector types

Ben Shi via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 23 02:13:20 PDT 2023


Author: Ben Shi
Date: 2023-08-23T17:12:36+08:00
New Revision: ad35d916cd34154fd88757374d26491611f60cff

URL: https://github.com/llvm/llvm-project/commit/ad35d916cd34154fd88757374d26491611f60cff
DIFF: https://github.com/llvm/llvm-project/commit/ad35d916cd34154fd88757374d26491611f60cff.diff

LOG: [VectorCombine] Enable transform 'foldSingleElementStore' for scalable vector types

The transform 'foldSingleElementStore' can also be applied to scalable
vector types, provided the index is known to be less than the type's
minimum number of elements. Such an index is in bounds for any vscale,
because a scalable vector always contains at least its known minimum
element count (vscale >= 1). A before/after sketch is given below.

Reviewed By: dmgreen, nikic

Differential Revision: https://reviews.llvm.org/D157676
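
For reference, a minimal before/after sketch of the fold on a scalable
vector, adapted from the insert_store_vscale test updated below:

  ; Before: load the whole scalable vector, insert one lane, store it back.
  define void @insert_store_vscale(ptr %q, i16 zeroext %s) {
  entry:
    %0 = load <vscale x 8 x i16>, ptr %q, align 16
    %vecins = insertelement <vscale x 8 x i16> %0, i16 %s, i32 3
    store <vscale x 8 x i16> %vecins, ptr %q, align 16
    ret void
  }

  ; After: a single scalar store; index 3 is always in bounds because
  ; <vscale x 8 x i16> has at least 8 elements for any vscale.
  define void @insert_store_vscale(ptr %q, i16 zeroext %s) {
  entry:
    %0 = getelementptr inbounds <vscale x 8 x i16>, ptr %q, i32 0, i32 3
    store i16 %s, ptr %0, align 2
    ret void
  }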

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/load-insert-store.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 13464c9d3496e0..66e3bcaac0adb2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1013,19 +1013,24 @@ class ScalarizationResult {
 
 /// Check if it is legal to scalarize a memory access to \p VecTy at index \p
 /// Idx. \p Idx must access a valid vector element.
-static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy,
-                                              Value *Idx, Instruction *CtxI,
+static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
+                                              Instruction *CtxI,
                                               AssumptionCache &AC,
                                               const DominatorTree &DT) {
+  // We do this check for both fixed and scalable vector types.
+  // NumElements is the number of elements of a fixed vector type,
+  // or the minimum number of elements of a scalable vector type.
+  uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
+
   if (auto *C = dyn_cast<ConstantInt>(Idx)) {
-    if (C->getValue().ult(VecTy->getNumElements()))
+    if (C->getValue().ult(NumElements))
       return ScalarizationResult::safe();
     return ScalarizationResult::unsafe();
   }
 
   unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
   APInt Zero(IntWidth, 0);
-  APInt MaxElts(IntWidth, VecTy->getNumElements());
+  APInt MaxElts(IntWidth, NumElements);
   ConstantRange ValidIndices(Zero, MaxElts);
   ConstantRange IdxRange(IntWidth, true);
 
@@ -1074,8 +1079,7 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
 //   store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
   auto *SI = cast<StoreInst>(&I);
-  if (!SI->isSimple() ||
-      !isa<FixedVectorType>(SI->getValueOperand()->getType()))
+  if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
     return false;
 
   // TODO: Combine more complicated patterns (multiple insert) by referencing
@@ -1089,7 +1093,7 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
     return false;
 
   if (auto *Load = dyn_cast<LoadInst>(Source)) {
-    auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
+    auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
     const DataLayout &DL = I.getModule()->getDataLayout();
     Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
     // Don't optimize for atomic/volatile load or store. Ensure memory is not

diff  --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
index 8d847af8d006d2..3b092fedfdfa21 100644
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=vector-combine -data-layout=e < %s | FileCheck %s --check-prefixes=CHECK,LE
-; RUN: opt -S -passes=vector-combine -data-layout=E < %s | FileCheck %s --check-prefixes=CHECK,BE
+; RUN: opt -S -passes=vector-combine -data-layout=e < %s | FileCheck %s
+; RUN: opt -S -passes=vector-combine -data-layout=E < %s | FileCheck %s
 
 define void @insert_store(ptr %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store(
@@ -49,9 +49,8 @@ entry:
 define void @insert_store_vscale(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store_vscale(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
-; CHECK-NEXT:    store <vscale x 8 x i16> [[VECINS]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 8 x i16>, ptr [[Q:%.*]], i32 0, i32 3
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -251,9 +250,8 @@ define void @insert_store_vscale_nonconst_index_known_valid_by_assume(ptr %q, i8
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX:%.*]], 4
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX]]
+; CHECK-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -351,10 +349,9 @@ entry:
 define void @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_and(ptr %q, i8 zeroext %s, i32 noundef %idx) {
 ; CHECK-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_and(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
 ; CHECK-NEXT:    [[IDX_CLAMPED:%.*]] = and i32 [[IDX:%.*]], 7
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX_CLAMPED]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX_CLAMPED]]
+; CHECK-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -493,10 +490,9 @@ entry:
 define void @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_urem(ptr %q, i8 zeroext %s, i32 noundef %idx) {
 ; CHECK-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_urem(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
 ; CHECK-NEXT:    [[IDX_CLAMPED:%.*]] = urem i32 [[IDX:%.*]], 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX_CLAMPED]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX_CLAMPED]]
+; CHECK-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -818,6 +814,3 @@ bb:
 
 declare i32 @bar(i32, i1) readonly
 declare double @llvm.log2.f64(double)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; BE: {{.*}}
-; LE: {{.*}}
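
Conversely, the fold is conservatively skipped when the index cannot be
proven smaller than the minimum element count. A hypothetical sketch
(function name and constants illustrative, not taken from the test
file): clamping the index to [0, 31] is not enough for
<vscale x 16 x i8>, whose guaranteed minimum is only 16 elements, so
canScalarizeAccess reports the access as unsafe and the store is left
alone.

  ; Not folded: %idx is only known to lie in [0, 31], which is not
  ; contained in the valid index range [0, 16).
  define void @insert_store_vscale_idx_maybe_out_of_range(ptr %q, i8 zeroext %s, i32 noundef %idx) {
  entry:
    %idx.clamped = and i32 %idx, 31
    %0 = load <vscale x 16 x i8>, ptr %q, align 16
    %vecins = insertelement <vscale x 16 x i8> %0, i8 %s, i32 %idx.clamped
    store <vscale x 16 x i8> %vecins, ptr %q, align 16
    ret void
  }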

More information about the llvm-commits mailing list