[llvm] 6d2df18 - [VectorCombine] Restrict single-element-store index to inbounds constant

Qiu Chaofan via llvm-commits llvm-commits at lists.llvm.org
Tue May 11 22:21:33 PDT 2021


Author: Qiu Chaofan
Date: 2021-05-12T13:18:20+08:00
New Revision: 6d2df181638a34f5d4ebc0c92cfb6a30abf8588d

URL: https://github.com/llvm/llvm-project/commit/6d2df181638a34f5d4ebc0c92cfb6a30abf8588d
DIFF: https://github.com/llvm/llvm-project/commit/6d2df181638a34f5d4ebc0c92cfb6a30abf8588d.diff

LOG: [VectorCombine] Restrict single-element-store index to inbounds constant

The vector single-element update optimization landed in 2db4979, but its
scope needs restriction. This patch restricts the index to an inbounds
constant and requires the vector type to be fixed-width. In the future, we
may use value tracking to relax the constant restriction.

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D102146
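
For reference, a minimal sketch of the pattern foldSingleElementStore
targets (the function name, the <16 x i8> type, and the constant index 3
below are illustrative, not taken from this patch):

    define void @insert_store(<16 x i8>* %q, i8 zeroext %s) {
    entry:
      %0 = load <16 x i8>, <16 x i8>* %q
      %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
      store <16 x i8> %vecins, <16 x i8>* %q
      ret void
    }

    ; With a constant, inbounds index into a fixed-width vector, the fold
    ; rewrites the load/insertelement/store sequence into a single scalar
    ; store through a GEP:
    ;   %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %q, i32 0, i32 3
    ;   store i8 %s, i8* %gep, align 1
    ; With an out-of-bounds index (16 or more for this type) or a scalable
    ; vector type, the sequence is now left untouched.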

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/load-insert-store.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index cee7880d189a5..c254f61285819 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -781,24 +781,29 @@ static bool isMemModifiedBetween(BasicBlock::iterator Begin,
 //   store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
   StoreInst *SI = dyn_cast<StoreInst>(&I);
-  if (!SI || !SI->isSimple() || !SI->getValueOperand()->getType()->isVectorTy())
+  if (!SI || !SI->isSimple() ||
+      !isa<FixedVectorType>(SI->getValueOperand()->getType()))
     return false;
 
   // TODO: Combine more complicated patterns (multiple insert) by referencing
   // TargetTransformInfo.
   Instruction *Source;
-  Value *NewElement, *Idx;
+  Value *NewElement;
+  ConstantInt *Idx;
   if (!match(SI->getValueOperand(),
              m_InsertElt(m_Instruction(Source), m_Value(NewElement),
-                         m_Value(Idx))))
+                         m_ConstantInt(Idx))))
     return false;
 
   if (auto *Load = dyn_cast<LoadInst>(Source)) {
+    auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
     const DataLayout &DL = I.getModule()->getDataLayout();
     Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
-    // Don't optimize for atomic/volatile load or stores.
+    // Don't optimize for atomic/volatile load or store. Ensure memory is not
+    // modified between, vector type matches store size, and index is inbounds.
     if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
         !DL.typeSizeEqualsStoreSize(Load->getType()) ||
+        Idx->uge(VecTy->getNumElements()) ||
         SrcAddr != SI->getPointerOperand()->stripPointerCasts() ||
         isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
                              MemoryLocation::get(SI), AA))

diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
index 3a6a7aa1cf71c..71feaa79a95a3 100644
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -30,6 +30,37 @@ entry:
   ret void
 }
 
+; Verify that the transform is not applied when the index is out of bounds.
+define void @insert_store_outofbounds(<8 x i16>* %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_outofbounds(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, <8 x i16>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 9
+; CHECK-NEXT:    store <8 x i16> [[VECINS]], <8 x i16>* [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %q
+  %vecins = insertelement <8 x i16> %0, i16 %s, i32 9
+  store <8 x i16> %vecins, <8 x i16>* %q
+  ret void
+}
+
+define void @insert_store_vscale(<vscale x 8 x i16>* %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_vscale(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, <vscale x 8 x i16>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <vscale x 8 x i16> [[VECINS]], <vscale x 8 x i16>* [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <vscale x 8 x i16>, <vscale x 8 x i16>* %q
+  %vecins = insertelement <vscale x 8 x i16> %0, i16 %s, i32 3
+  store <vscale x 8 x i16> %vecins, <vscale x 8 x i16>* %q
+  ret void
+}
+
 define void @insert_store_v9i4(<9 x i4>* %q, i4 zeroext %s) {
 ; CHECK-LABEL: @insert_store_v9i4(
 ; CHECK-NEXT:  entry:
@@ -82,8 +113,9 @@ cont:
 define void @insert_store_nonconst(<16 x i8>* %q, i8 zeroext %s, i32 %idx) {
 ; CHECK-LABEL: @insert_store_nonconst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q:%.*]], i32 0, i32 [[IDX:%.*]]
-; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    store <16 x i8> [[VECINS]], <16 x i8>* [[Q]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -93,17 +125,17 @@ entry:
   ret void
 }
 
-define void @insert_store_ptr_strip(<16 x i8>* %q, i8 zeroext %s, i32 %idx) {
+define void @insert_store_ptr_strip(<16 x i8>* %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store_ptr_strip(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ADDR0:%.*]] = bitcast <16 x i8>* [[Q:%.*]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 3
 ; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %0 = load <16 x i8>, <16 x i8>* %q
-  %vecins = insertelement <16 x i8> %0, i8 %s, i32 %idx
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
   %addr0 = bitcast <16 x i8>* %q to <2 x i64>*
   %addr1 = getelementptr <2 x i64>, <2 x i64>* %addr0, i64 0
   %addr2 = bitcast <2 x i64>* %addr1 to <16 x i8>*


        

