[llvm-commits] [llvm] r158623 - in /llvm/trunk: lib/Transforms/Scalar/ScalarReplAggregates.cpp test/Transforms/ScalarRepl/dynamic-vector-gep.ll

Pete Cooper peter_cooper at apple.com
Sat Jun 16 20:58:27 PDT 2012


Author: pete
Date: Sat Jun 16 22:58:26 2012
New Revision: 158623

URL: http://llvm.org/viewvc/llvm-project?rev=158623&view=rev
Log:
Now that SROA can form allocas for dynamic vector accesses, further improve it to replace operations on these vector allocas with insertelement/extractelement instructions
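
For context, a minimal before/after sketch of the rewrite this enables
(hypothetical IR modeled on the test2 case in the diff below; %idx stands
for an i32 dynamic index):

  ; before: a dynamic GEP into a vector alloca, accessed via float load/store
  %a = alloca <4 x float>
  store <4 x float> zeroinitializer, <4 x float>* %a
  %p = getelementptr <4 x float>* %a, i32 0, i32 %idx
  store float 1.0, float* %p

  ; after: the alloca is promoted and the dynamic store becomes
  %v = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx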

Modified:
    llvm/trunk/lib/Transforms/Scalar/ScalarReplAggregates.cpp
    llvm/trunk/test/Transforms/ScalarRepl/dynamic-vector-gep.ll

Modified: llvm/trunk/lib/Transforms/Scalar/ScalarReplAggregates.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/ScalarReplAggregates.cpp?rev=158623&r1=158622&r2=158623&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/ScalarReplAggregates.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/ScalarReplAggregates.cpp Sat Jun 16 22:58:26 2012
@@ -264,23 +264,31 @@
   /// large integers unless there is some potential for optimization.
   bool HadNonMemTransferAccess;
 
+  /// HadDynamicAccess - True if some access to this alloca used a dynamic
+  /// index.  We don't yet have support for turning a dynamic access into a
+  /// large integer.
+  bool HadDynamicAccess;
+
 public:
   explicit ConvertToScalarInfo(unsigned Size, const TargetData &td)
     : AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown),
-      VectorTy(0), HadNonMemTransferAccess(false) { }
+      VectorTy(0), HadNonMemTransferAccess(false), HadDynamicAccess(false) { }
 
   AllocaInst *TryConvert(AllocaInst *AI);
 
 private:
-  bool CanConvertToScalar(Value *V, uint64_t Offset);
+  bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx);
   void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
   bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
-  void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
+  void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset,
+                           Value *NonConstantIdx);
 
   Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
-                                    uint64_t Offset, IRBuilder<> &Builder);
+                                    uint64_t Offset, Value* NonConstantIdx,
+                                    IRBuilder<> &Builder);
   Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
-                                   uint64_t Offset, IRBuilder<> &Builder);
+                                   uint64_t Offset, Value* NonConstantIdx,
+                                   IRBuilder<> &Builder);
 };
 } // end anonymous namespace.
 
@@ -291,7 +299,7 @@
 AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
   // If we can't convert this scalar, or if mem2reg can trivially do it, bail
   // out.
-  if (!CanConvertToScalar(AI, 0) || !IsNotTrivial)
+  if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial)
     return 0;
 
   // If an alloca has only memset / memcpy uses, it may still have an Unknown
@@ -319,13 +327,18 @@
     if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
         !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth))
       return 0;
+    // Dynamic accesses on integers aren't yet supported.  They would require
+    // shifting by a dynamic amount, and we might not even know whether to use
+    // a left or right shift.
+    if (ScalarKind == Integer && HadDynamicAccess)
+      return 0;
 
     DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
     // Create and insert the integer alloca.
     NewTy = IntegerType::get(AI->getContext(), BitWidth);
   }
   AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
-  ConvertUsesToScalar(AI, NewAI, 0);
+  ConvertUsesToScalar(AI, NewAI, 0, 0);
   return NewAI;
 }
 
@@ -412,7 +425,8 @@
 ///
 /// If we see at least one access to the value that is as a vector type, set the
 /// SawVec flag.
-bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
+bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
+                                             Value* NonConstantIdx) {
   for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
     Instruction *User = cast<Instruction>(*UI);
 
@@ -442,24 +456,35 @@
     if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
       if (!onlyUsedByLifetimeMarkers(BCI))
         IsNotTrivial = true;  // Can't be mem2reg'd.
-      if (!CanConvertToScalar(BCI, Offset))
+      if (!CanConvertToScalar(BCI, Offset, NonConstantIdx))
         return false;
       continue;
     }
 
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
      // If this is a GEP with variable indices, we can only handle a single
      // non-constant index into a vector.
-      if (!GEP->hasAllConstantIndices())
+      PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
+      if (!PtrTy)
         return false;
 
       // Compute the offset that this GEP adds to the pointer.
       SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
-      if (!GEP->getPointerOperandType()->isPointerTy())
-        return false;
-      uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
+      Value *GEPNonConstantIdx = 0;
+      if (!GEP->hasAllConstantIndices()) {
+        if (!isa<VectorType>(PtrTy->getElementType()))
+          return false;
+        if (NonConstantIdx)
+          return false;
+        GEPNonConstantIdx = Indices.pop_back_val();
+        if (!GEPNonConstantIdx->getType()->isIntegerTy(32))
+          return false;
+        HadDynamicAccess = true;
+      } else
+        GEPNonConstantIdx = NonConstantIdx;
+      uint64_t GEPOffset = TD.getIndexedOffset(PtrTy,
                                                Indices);
       // See if all uses can be converted.
-      if (!CanConvertToScalar(GEP, Offset+GEPOffset))
+      if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
         return false;
       IsNotTrivial = true;  // Can't be mem2reg'd.
       HadNonMemTransferAccess = true;
@@ -469,6 +494,9 @@
     // If this is a constant sized memset of a constant value (e.g. 0) we can
     // handle it.
     if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+      // Store to dynamic index.
+      if (NonConstantIdx)
+        return false;
       // Store of constant value.
       if (!isa<ConstantInt>(MSI->getValue()))
         return false;
@@ -493,6 +521,9 @@
     // If this is a memcpy or memmove into or out of the whole allocation, we
     // can handle it like a load or store of the scalar type.
     if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+      // Transfer to/from a dynamic index.
+      if (NonConstantIdx)
+        return false;
       ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
       if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0)
         return false;
@@ -524,12 +555,13 @@
 /// Offset is an offset from the original alloca, in bits that need to be
 /// shifted to the right.  By the end of this, there should be no uses of Ptr.
 void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
-                                              uint64_t Offset) {
+                                              uint64_t Offset,
+                                              Value* NonConstantIdx) {
   while (!Ptr->use_empty()) {
     Instruction *User = cast<Instruction>(Ptr->use_back());
 
     if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
-      ConvertUsesToScalar(CI, NewAI, Offset);
+      ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx);
       CI->eraseFromParent();
       continue;
     }
@@ -537,9 +569,11 @@
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
       // Compute the offset that this GEP adds to the pointer.
       SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      if (!GEP->hasAllConstantIndices())
+        NonConstantIdx = Indices.pop_back_val();
       uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
                                                Indices);
-      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
+      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, NonConstantIdx);
       GEP->eraseFromParent();
       continue;
     }
@@ -550,7 +584,8 @@
       // The load is a bit extract from NewAI shifted right by Offset bits.
       Value *LoadedVal = Builder.CreateLoad(NewAI);
       Value *NewLoadVal
-        = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
+        = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset,
+                                     NonConstantIdx, Builder);
       LI->replaceAllUsesWith(NewLoadVal);
       LI->eraseFromParent();
       continue;
@@ -560,7 +595,7 @@
       assert(SI->getOperand(0) != Ptr && "Consistency error!");
       Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
       Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
-                                             Builder);
+                                             NonConstantIdx, Builder);
       Builder.CreateStore(New, NewAI);
       SI->eraseFromParent();
 
@@ -575,6 +610,7 @@
     // transform it into a store of the expanded constant value.
     if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
       assert(MSI->getRawDest() == Ptr && "Consistency error!");
+      assert(!NonConstantIdx && "Cannot replace dynamic memset with insert");
       int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue();
       if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
         unsigned NumBytes = static_cast<unsigned>(SNumBytes);
@@ -591,7 +627,7 @@
         Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
         Value *New = ConvertScalar_InsertValue(
                                     ConstantInt::get(User->getContext(), APVal),
-                                               Old, Offset, Builder);
+                                               Old, Offset, 0, Builder);
         Builder.CreateStore(New, NewAI);
 
         // If the load we just inserted is now dead, then the memset overwrote
@@ -607,6 +643,7 @@
     // can handle it like a load or store of the scalar type.
     if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
       assert(Offset == 0 && "must be store to start of alloca");
+      assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert");
 
       // If the source and destination are both to the same alloca, then this is
       // a noop copy-to-self, just delete it.  Otherwise, emit a load and store
@@ -679,7 +716,8 @@
 /// shifted to the right.
 Value *ConvertToScalarInfo::
 ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
-                           uint64_t Offset, IRBuilder<> &Builder) {
+                           uint64_t Offset, Value* NonConstantIdx,
+                           IRBuilder<> &Builder) {
   // If the load is of the whole new alloca, no conversion is needed.
   Type *FromType = FromVal->getType();
   if (FromType == ToType && Offset == 0)
@@ -701,7 +739,17 @@
       assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
     }
     // Return the element extracted out of it.
-    Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt));
+    Value *Idx;
+    if (NonConstantIdx) {
+      if (Elt)
+        Idx = Builder.CreateAdd(NonConstantIdx,
+                                Builder.getInt32(Elt),
+                                "dyn.offset");
+      else
+        Idx = NonConstantIdx;
+    } else
+      Idx = Builder.getInt32(Elt);
+    Value *V = Builder.CreateExtractElement(FromVal, Idx);
     if (V->getType() != ToType)
       V = Builder.CreateBitCast(V, ToType);
     return V;
@@ -710,23 +758,27 @@
   // If ToType is a first class aggregate, extract out each of the pieces and
   // use insertvalue's to form the FCA.
   if (StructType *ST = dyn_cast<StructType>(ToType)) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into struct types not supported");
     const StructLayout &Layout = *TD.getStructLayout(ST);
     Value *Res = UndefValue::get(ST);
     for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
       Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
                                         Offset+Layout.getElementOffsetInBits(i),
-                                              Builder);
+                                              0, Builder);
       Res = Builder.CreateInsertValue(Res, Elt, i);
     }
     return Res;
   }
 
   if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into array types not supported");
     uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
     Value *Res = UndefValue::get(AT);
     for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
       Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
-                                              Offset+i*EltSize, Builder);
+                                              Offset+i*EltSize, 0, Builder);
       Res = Builder.CreateInsertValue(Res, Elt, i);
     }
     return Res;
@@ -792,9 +844,14 @@
 ///
 /// Offset is an offset from the original alloca, in bits that need to be
 /// shifted to the right.
+///
+/// NonConstantIdx is the index value if there was a GEP with a non-constant
+/// index.  If this is 0 then all GEPs used to find this insert address have
+/// constant indices.
 Value *ConvertToScalarInfo::
 ConvertScalar_InsertValue(Value *SV, Value *Old,
-                          uint64_t Offset, IRBuilder<> &Builder) {
+                          uint64_t Offset, Value* NonConstantIdx,
+                          IRBuilder<> &Builder) {
   // Convert the stored type to the actual type, shift it left to insert
   // then 'or' into place.
   Type *AllocaType = Old->getType();
@@ -815,26 +872,40 @@
       SV = Builder.CreateBitCast(SV, EltTy);
     uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy);
     unsigned Elt = Offset/EltSize;
-    return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt));
+    Value *Idx;
+    if (NonConstantIdx) {
+      if (Elt)
+        Idx = Builder.CreateAdd(NonConstantIdx,
+                                Builder.getInt32(Elt),
+                                "dyn.offset");
+      else
+        Idx = NonConstantIdx;
+    } else
+      Idx = Builder.getInt32(Elt);
+    return Builder.CreateInsertElement(Old, SV, Idx);
   }
 
   // If SV is a first-class aggregate value, insert each value recursively.
   if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into struct types not supported");
     const StructLayout &Layout = *TD.getStructLayout(ST);
     for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
       Value *Elt = Builder.CreateExtractValue(SV, i);
       Old = ConvertScalar_InsertValue(Elt, Old,
                                       Offset+Layout.getElementOffsetInBits(i),
-                                      Builder);
+                                      0, Builder);
     }
     return Old;
   }
 
   if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into array types not supported");
     uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
     for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
       Value *Elt = Builder.CreateExtractValue(SV, i);
-      Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
+      Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder);
     }
     return Old;
   }
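
A note on the new index computation: when the dynamic access begins at a
non-zero constant element offset (Elt != 0 above), ConvertScalar_ExtractValue
and ConvertScalar_InsertValue fold that offset into the dynamic index with an
add named "dyn.offset".  A minimal IR sketch of the result, assuming an i32
index %idx into a <4 x float> starting one element in (this mirrors the new
test8 below):

  %dyn.offset = add i32 %idx, 1
  %v = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %dyn.offset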

Modified: llvm/trunk/test/Transforms/ScalarRepl/dynamic-vector-gep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/ScalarRepl/dynamic-vector-gep.ll?rev=158623&r1=158622&r2=158623&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/ScalarRepl/dynamic-vector-gep.ll (original)
+++ llvm/trunk/test/Transforms/ScalarRepl/dynamic-vector-gep.ll Sat Jun 16 22:58:26 2012
@@ -4,12 +4,14 @@
 target triple = "x86_64-apple-darwin10.0.0"
 
 ; CHECK: @test1
-; CHECK: %[[alloc0:[\.a-z0-9]*]] = alloca <4 x float>
-; CHECK: %[[alloc1:[\.a-z0-9]*]] = alloca <4 x float>
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc0]]
+; CHECK: %[[alloc:[\.a-z0-9]*]] = alloca <4 x float>
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc]]
+; CHECK: memset
+; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
 
 ; Split the array but don't replace the memset with an insert
 ; element as it's not a constant offset.
+; The load, however, can be replaced with an extract element.
 define float @test1(i32 %idx1, i32 %idx2) {
 entry:
   %0 = alloca [4 x <4 x float>]
@@ -23,13 +25,8 @@
 }
 
 ; CHECK: @test2
-; CHECK: %[[alloc:[\.a-z0-9]*]] = alloca <4 x float>
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc]]
-; CHECK: %ptr1 = getelementptr inbounds <4 x float>* %[[alloc]], i32 0, i32 %idx1
-; CHECK: store float 1.000000e+00, float* %ptr1
-; CHECK: %ptr2 = getelementptr inbounds <4 x float>* %[[alloc]], i32 0, i32 %idx2
-; CHECK: %ret = load float* %ptr2
-; CHECK: ret float %ret
+; CHECK: %[[ins:[\.a-z0-9]*]] = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
+; CHECK: extractelement <4 x float> %[[ins]], i32 %idx2
 
 ; Do SROA on the array when it has dynamic vector reads and writes.
 define float @test2(i32 %idx1, i32 %idx2) {
@@ -61,13 +58,34 @@
   ret float %ret
 }
 
-; CHECK: @test4
+; CHECK: test4
+; CHECK: insertelement <16 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
+; CHECK: extractelement <16 x float> %0, i32 %idx2
+
+; A dynamically indexed vector that spans more than one element of the
+; enclosing alloca array would normally block SROA (see test3).
+; However, unlike test3, the store here is of the whole vector type,
+; so SROA converts the large alloca into a single large vector and
+; performs all accesses with insertelement/extractelement.
+define float @test4(i32 %idx1, i32 %idx2) {
+entry:
+  %0 = alloca [4 x <4 x float>]
+  %bigvec = bitcast [4 x <4 x float>]* %0 to <16 x float>*
+  store <16 x float> zeroinitializer, <16 x float>* %bigvec
+  %ptr1 = getelementptr <16 x float>* %bigvec, i32 0, i32 %idx1
+  store float 1.0, float* %ptr1
+  %ptr2 = getelementptr <16 x float>* %bigvec, i32 0, i32 %idx2
+  %ret = load float* %ptr2
+  ret float %ret
+}
+
+; CHECK: @test5
 ; CHECK: %0 = alloca [4 x <4 x float>]
 ; CHECK-NOT: alloca
 
 ; Don't do SROA as there is a second dynamically indexed array
 ; which may span multiple elements of the alloca.
-define float @test4(i32 %idx1, i32 %idx2) {
+define float @test5(i32 %idx1, i32 %idx2) {
 entry:
   %0 = alloca [4 x <4 x float>]
   store [4 x <4 x float>] zeroinitializer, [4 x <4 x float>]* %0
@@ -80,15 +98,9 @@
   ret float %ret
 }
 
-; CHECK: test5
-; CHECK: %[[alloc0:[\.a-z0-9]*]] = alloca <4 x float>
-; CHECK: %[[alloc1:[\.a-z0-9]*]] = alloca <4 x float>
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc0]]
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc1]]
-; CHECK: %ptr1 = getelementptr inbounds <4 x float>* %[[alloc0]], i32 0, i32 %idx1
-; CHECK: store float 1.000000e+00, float* %ptr1
-; CHECK: %ptr2 = getelementptr inbounds <4 x float>* %[[alloc1]], i32 0, i32 %idx2
-; CHECK: %ret = load float* %ptr2
+; CHECK: test6
+; CHECK: insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
+; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
 
 %vector.pair = type { %vector.anon, %vector.anon }
 %vector.anon = type { %vector }
@@ -99,7 +111,7 @@
 ; the original GEP, just the indices it needs to get to the correct offset of
 ; some type, not necessarily the dynamic vector.
 ; This test makes sure we don't have this crash.
-define float @test5(i32 %idx1, i32 %idx2) {
+define float @test6(i32 %idx1, i32 %idx2) {
 entry:
   %0 = alloca %vector.pair
   store %vector.pair zeroinitializer, %vector.pair* %0
@@ -110,21 +122,15 @@
   ret float %ret
 }
 
-; CHECK: test6
-; CHECK: %[[alloc0:[\.a-z0-9]*]] = alloca <4 x float>
-; CHECK: %[[alloc1:[\.a-z0-9]*]] = alloca <4 x float>
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc0]]
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %[[alloc1]]
-; CHECK: %ptr1 = getelementptr inbounds <4 x float>* %[[alloc0]], i32 0, i32 %idx1
-; CHECK: store float 1.000000e+00, float* %ptr1
-; CHECK: %ptr2 = getelementptr inbounds <4 x float>* %[[alloc1]], i32 0, i32 %idx2
-; CHECK: %ret = load float* %ptr2
+; CHECK: test7
+; CHECK: insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %idx1
+; CHECK: extractelement <4 x float> zeroinitializer, i32 %idx2
 
 %array.pair = type { [2 x %array.anon], %array.anon }
 %array.anon = type { [2 x %vector] }
 
-; This is the same as test5 and tests the same crash, but on arrays.
-define float @test6(i32 %idx1, i32 %idx2) {
+; This is the same as test6 and tests the same crash, but on arrays.
+define float @test7(i32 %idx1, i32 %idx2) {
 entry:
   %0 = alloca %array.pair
   store %array.pair zeroinitializer, %array.pair* %0
@@ -135,4 +141,27 @@
   ret float %ret
 }
 
+; CHECK: test8
+; CHECK: %[[offset1:[\.a-z0-9]*]] = add i32 %idx1, 1
+; CHECK: %[[ins:[\.a-z0-9]*]] = insertelement <4 x float> zeroinitializer, float 1.000000e+00, i32 %[[offset1]]
+; CHECK: %[[offset2:[\.a-z0-9]*]] = add i32 %idx2, 2
+; CHECK: extractelement <4 x float> %[[ins]], i32 %[[offset2]]
+
+; Do SROA on the vector when it has dynamic vector reads and writes
+; from a non-zero offset.
+define float @test8(i32 %idx1, i32 %idx2) {
+entry:
+  %0 = alloca <4 x float>
+  store <4 x float> zeroinitializer, <4 x float>* %0
+  %ptr1 = getelementptr <4 x float>* %0, i32 0, i32 1
+  %ptr2 = bitcast float* %ptr1 to <3 x float>*
+  %ptr3 = getelementptr <3 x float>* %ptr2, i32 0, i32 %idx1
+  store float 1.0, float* %ptr3
+  %ptr4 = getelementptr <4 x float>* %0, i32 0, i32 2
+  %ptr5 = bitcast float* %ptr4 to <2 x float>*
+  %ptr6 = getelementptr <2 x float>* %ptr5, i32 0, i32 %idx2
+  %ret = load float* %ptr6
+  ret float %ret
+}
+
 declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)
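
For reference, the RUN line driving these CHECKs sits above this hunk (the
diff starts at line 4 of the file); presumably it is the standard form used
by the ScalarRepl tests, something like:

  ; RUN: opt < %s -scalarrepl -S | FileCheck %s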




