[llvm] 6e7eeb4 - [GVN] Fix VNCoercion for Scalable Vector.

Huihui Zhang via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 10 17:49:25 PDT 2020


Author: Huihui Zhang
Date: 2020-04-10T17:49:07-07:00
New Revision: 6e7eeb44b305391f736437d050729b09c02fda0f

URL: https://github.com/llvm/llvm-project/commit/6e7eeb44b305391f736437d050729b09c02fda0f
DIFF: https://github.com/llvm/llvm-project/commit/6e7eeb44b305391f736437d050729b09c02fda0f.diff

LOG: [GVN] Fix VNCoercion for Scalable Vector.

Summary:
In VNCoercion, skip scalable vectors wherever the analysis relies on a fixed
size; otherwise, call TypeSize::getFixedSize() explicitly.
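
To illustrate the idiom (a minimal sketch, not part of the patch; the helper
name getFixedStoreSize is hypothetical): size queries on DataLayout return a
TypeSize, so code that assumes a compile-time size first rejects scalable
vectors and only then reads the size explicitly.

  // Illustrative only -- mirrors the idiom the patch applies in VNCoercion.
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Type.h"

  using namespace llvm;

  // Hypothetical helper: returns false for types whose size is not known at
  // compile time, otherwise reports the store size in bytes. Ty is assumed
  // to be a sized type.
  static bool getFixedStoreSize(Type *Ty, const DataLayout &DL,
                                uint64_t &SizeInBytes) {
    // Scalable vectors have no fixed size, so offset-based reasoning bails.
    if (Ty->isVectorTy() && Ty->getVectorIsScalable())
      return false;
    // For everything else the size is fixed; ask for it explicitly rather
    // than relying on the implicit TypeSize -> integer conversion.
    SizeInBytes = DL.getTypeStoreSize(Ty).getFixedSize();
    return true;
  }

In the VNCoercion changes below, the same two-step pattern shows up as
isFirstClassAggregateOrScalableType() guards followed by explicit
getFixedSize() calls on the TypeSize results.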

Add unit tests to check the functionality of GVN load elimination for scalable types.

Reviewers: sdesmalen, efriedma, spatel, fhahn, reames, apazos, ctetreau

Reviewed By: efriedma

Subscribers: bjope, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76944

Added: 
    llvm/test/Transforms/GVN/vscale.ll

Modified: 
    llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
    llvm/lib/Transforms/Utils/VNCoercion.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 687ac25dd028..4f86cdd9bb8d 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1902,7 +1902,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     // If the element type has zero size then any index over it is equivalent
     // to an index of zero, so replace it with zero if it is not zero already.
     Type *EltTy = GTI.getIndexedType();
-    if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0)
+    if (EltTy->isSized() && DL.getTypeAllocSize(EltTy).isZero())
       if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
         *I = Constant::getNullValue(NewIndexType);
         MadeChange = true;

diff  --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 23fb770400d5..0c05c2969603 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -10,6 +10,11 @@
 namespace llvm {
 namespace VNCoercion {
 
+static bool isFirstClassAggregateOrScalableType(Type *Ty) {
+  return Ty->isStructTy() || Ty->isArrayTy() ||
+         (Ty->isVectorTy() && Ty->getVectorIsScalable());
+}
+
 /// Return true if coerceAvailableValueToLoadType will succeed.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
                                      const DataLayout &DL) {
@@ -17,20 +22,20 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
   if (StoredTy == LoadTy)
     return true;
 
-  // If the loaded or stored value is an first class array or struct, don't try
-  // to transform them.  We need to be able to bitcast to integer.
-  if (LoadTy->isStructTy() || LoadTy->isArrayTy() || StoredTy->isStructTy() ||
-      StoredTy->isArrayTy())
+  // If the loaded/stored value is a first class array/struct, or scalable type,
+  // don't try to transform them. We need to be able to bitcast to integer.
+  if (isFirstClassAggregateOrScalableType(LoadTy) ||
+      isFirstClassAggregateOrScalableType(StoredTy))
     return false;
 
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy);
+  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
 
   // The store size must be byte-aligned to support future type casts.
   if (llvm::alignTo(StoreSize, 8) != StoreSize)
     return false;
 
   // The store has to be at least as big as the load.
-  if (StoreSize < DL.getTypeSizeInBits(LoadTy))
+  if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize())
     return false;
 
   // Don't coerce non-integral pointers to integers or vice versa.
@@ -59,8 +64,8 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
-  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
-  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
+  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
+  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
 
   // If the store and reload are the same size, we can always reuse it.
   if (StoredValSize == LoadedValSize) {
@@ -112,8 +117,8 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
   // If this is a big-endian system, we need to shift the value down to the low
   // bits so that a truncate will work.
   if (DL.isBigEndian()) {
-    uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
-                        DL.getTypeStoreSizeInBits(LoadedTy);
+    uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
+                        DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
     StoredVal = Helper.CreateLShr(
         StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
   }
@@ -161,9 +166,9 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
                                           Value *WritePtr,
                                           uint64_t WriteSizeInBits,
                                           const DataLayout &DL) {
-  // If the loaded or stored value is a first class array or struct, don't try
-  // to transform them.  We need to be able to bitcast to integer.
-  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+  // If the loaded/stored value is a first class array/struct, or scalable type,
+  // don't try to transform them. We need to be able to bitcast to integer.
+  if (isFirstClassAggregateOrScalableType(LoadTy))
     return -1;
 
   int64_t StoreOffset = 0, LoadOffset = 0;
@@ -181,7 +186,7 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
   // If the load and store don't overlap at all, the store doesn't provide
   // anything to the load.  In this case, they really don't alias at all, AA
   // must have gotten confused.
-  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
 
   if ((WriteSizeInBits & 7) | (LoadSize & 7))
     return -1;
@@ -215,10 +220,9 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
 int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
                                    StoreInst *DepSI, const DataLayout &DL) {
   auto *StoredVal = DepSI->getValueOperand();
-  
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (StoredVal->getType()->isStructTy() ||
-      StoredVal->getType()->isArrayTy())
+
+  // Cannot handle reading from store of first-class aggregate or scalable type.
+  if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
     return -1;
 
   // Don't coerce non-integral pointers to integers or vice versa.
@@ -232,7 +236,7 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
 
   Value *StorePtr = DepSI->getPointerOperand();
   uint64_t StoreSize =
-      DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+      DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
   return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
                                         DL);
 }
@@ -337,7 +341,7 @@ int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
-  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
   int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
   if (R != -1)
     return R;
@@ -347,7 +351,7 @@ int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
   int64_t LoadOffs = 0;
   const Value *LoadBase =
       GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
 
   unsigned Size =
       getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI);
@@ -437,8 +441,9 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
     return SrcVal;
   }
 
-  uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
-  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+  uint64_t StoreSize =
+      (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
+  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
   // Compute which bits of the stored value are being used by the load.  Convert
   // to an integer type to start with.
   if (SrcVal->getType()->isPtrOrPtrVectorTy())
@@ -490,8 +495,9 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
                            Instruction *InsertPt, const DataLayout &DL) {
   // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
   // widen SrcVal out to a larger load.
-  unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  unsigned SrcValStoreSize =
+      DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
   if (Offset + LoadSize > SrcValStoreSize) {
     assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
     assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
@@ -534,8 +540,9 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
 
 Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
                                       Type *LoadTy, const DataLayout &DL) {
-  unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  unsigned SrcValStoreSize =
+      DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
   if (Offset + LoadSize > SrcValStoreSize)
     return nullptr;
   return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
@@ -546,7 +553,7 @@ T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
                                 Type *LoadTy, HelperClass &Helper,
                                 const DataLayout &DL) {
   LLVMContext &Ctx = LoadTy->getContext();
-  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8;
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
 
   // We know that this method is only called when the mem transfer fully
   // provides the bits for the load.

diff  --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
new file mode 100644
index 000000000000..da98fd89f809
--- /dev/null
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -0,0 +1,344 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -basicaa -gvn -dce | FileCheck %s
+
+; Analyze Load from clobbering Load.
+
+define <vscale x 4 x i32> @load_store_clobber_load(<vscale x 4 x i32> *%p)  {
+; CHECK-LABEL: @load_store_clobber_load(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* undef
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* undef
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; <- load to be eliminated
+  %add = add <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_mayalias(<vscale x 4 x i32>* %p, <vscale x 4 x i32>* %p2) {
+; CHECK-LABEL: @load_store_clobber_load_mayalias(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P2:%.*]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[SUB]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p2
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  %sub = sub <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_noalias(<vscale x 4 x i32>* noalias %p, <vscale x 4 x i32>* noalias %p2) {
+; CHECK-LABEL: @load_store_clobber_load_noalias(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P2:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p2
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; <- load to be eliminated
+  %add = add <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %add
+}
+
+; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias.
+define i32 @load_clobber_load_gep1(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep1(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT:    [[P2:%.*]] = bitcast <vscale x 4 x i32>* [[P]] to i32*
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[P2]], i64 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, i32* [[GEP2]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 0, i64 1
+  %load1 = load i32, i32* %gep1
+  %p2 = bitcast <vscale x 4 x i32>* %p to i32*
+  %gep2 = getelementptr i32, i32* %p2, i64 1
+  %load2 = load i32, i32* %gep2 ; <- load could be eliminated
+  %add = add i32 %load1, %load2
+  ret i32 %add
+}
+
+define i32 @load_clobber_load_gep2(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep2(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT:    [[P2:%.*]] = bitcast <vscale x 4 x i32>* [[P]] to i32*
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[P2]], i64 4
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, i32* [[GEP2]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+  %load1 = load i32, i32* %gep1
+  %p2 = bitcast <vscale x 4 x i32>* %p to i32*
+  %gep2 = getelementptr i32, i32* %p2, i64 4
+  %load2 = load i32, i32* %gep2 ; <- can not determine at compile-time if %load1 and %load2 are same addr
+  %add = add i32 %load1, %load2
+  ret i32 %add
+}
+
+; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias.
+define i32 @load_clobber_load_gep3(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep3(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT:    [[P2:%.*]] = bitcast <vscale x 4 x i32>* [[P]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* [[P2]], i64 1, i64 0
+; CHECK-NEXT:    [[LOAD2:%.*]] = load float, float* [[GEP2]]
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast float [[LOAD2]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[CAST]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+  %load1 = load i32, i32* %gep1
+  %p2 = bitcast <vscale x 4 x i32>* %p to <vscale x 4 x float>*
+  %gep2 = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %p2, i64 1, i64 0
+  %load2 = load float, float* %gep2 ; <- load could be eliminated
+  %cast = bitcast float %load2 to i32
+  %add = add i32 %load1, %cast
+  ret i32 %add
+}
+
+define <vscale x 4 x i32> @load_clobber_load_fence(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_fence(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    call void asm "", "~{memory}"()
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[SUB]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  call void asm "", "~{memory}"()
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  %sub = sub <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @load_clobber_load_sideeffect(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_sideeffect(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    call void asm sideeffect "", ""()
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  call void asm sideeffect "", ""()
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  %add = add <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %add
+}
+
+; Analyze Load from clobbering Store.
+
+define <vscale x 4 x i32> @store_forward_to_load(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @store_forward_to_load(
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+  %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i32> @store_forward_to_load_sideeffect(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @store_forward_to_load_sideeffect(
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    call void asm sideeffect "", ""()
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[LOAD]]
+;
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+  call void asm sideeffect "", ""()
+  %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  ret <vscale x 4 x i32> %load
+}
+
+define i32 @store_clobber_load() {
+; CHECK-LABEL: @store_clobber_load(
+; CHECK-NEXT:    [[ALLOC:%.*]] = alloca <vscale x 4 x i32>
+; CHECK-NEXT:    store <vscale x 4 x i32> undef, <vscale x 4 x i32>* [[ALLOC]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[ALLOC]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PTR]]
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  %alloc = alloca <vscale x 4 x i32>
+  store <vscale x 4 x i32> undef, <vscale x 4 x i32>* %alloc
+  %ptr = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %alloc, i32 0, i32 1
+  %load = load i32, i32* %ptr
+  ret i32 %load
+}
+
+; Analyze Load from clobbering MemInst.
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+define i32 @memset_clobber_load(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @memset_clobber_load(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT:    tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT:    ret i32 16843009
+;
+  %conv = bitcast <vscale x 4 x i32>* %p to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 0, i64 5
+  %load = load i32, i32* %gep
+  ret i32 %load
+}
+
+define i32 @memset_clobber_load_vscaled_base(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @memset_clobber_load_vscaled_base(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT:    tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1, i64 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  %conv = bitcast <vscale x 4 x i32>* %p to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+  %load = load i32, i32* %gep
+  ret i32 %load
+}
+
+define i32 @memset_clobber_load_nonconst_index(<vscale x 4 x i32> *%p, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: @memset_clobber_load_nonconst_index(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT:    tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 [[IDX1:%.*]], i64 [[IDX2:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  %conv = bitcast <vscale x 4 x i32>* %p to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 %idx1, i64 %idx2
+  %load = load i32, i32* %gep
+  ret i32 %load
+}
+
+
+; Load elimination across BBs
+
+define <vscale x 4 x i32>* @load_from_alloc_replaced_with_undef() {
+; CHECK-LABEL: @load_from_alloc_replaced_with_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <vscale x 4 x i32>
+; CHECK-NEXT:    br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[A]]
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret <vscale x 4 x i32>* [[A]]
+;
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %a, i64 0, i64 1
+  %load = load i32, i32* %gep ; <- load to be eliminated
+  %tobool = icmp eq i32 %load, 0 ; <- icmp to be eliminated
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %a
+  br label %if.end
+
+if.end:
+  ret <vscale x 4 x i32>* %a
+}
+
+define i32 @redundant_load_elimination_1(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @redundant_load_elimination_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[LOAD1]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i32 [[LOAD1]]
+;
+entry:
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+  %load1 = load i32, i32* %gep
+  %cmp = icmp eq i32 %load1, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %load2 = load i32, i32* %gep ; <- load to be eliminated
+  %add = add i32 %load1, %load2
+  br label %if.end
+
+if.end:
+  %result = phi i32 [ %add, %if.then ], [ %load1, %entry ]
+  ret i32 %result
+}
+
+; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as NoAlias.
+define void @redundant_load_elimination_2(i1 %c, <vscale x 4 x i32>* %p, i32* %q, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: @redundant_load_elimination_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 1
+; CHECK-NEXT:    store i32 0, i32* [[GEP1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1, i64 0
+; CHECK-NEXT:    store i32 1, i32* [[GEP2]]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[T:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT:    store i32 [[T]], i32* [[Q:%.*]]
+; CHECK-NEXT:    ret void
+; CHECK:       if.else:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+  store i32 0, i32* %gep1
+  %gep2 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+  store i32 1, i32* %gep2
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+  %t = load i32, i32* %gep1 ; <- load could be eliminated
+  store i32 %t, i32* %q
+  ret void
+
+if.else:
+  ret void
+}
+
+; TODO: load in if.then could have been eliminated
+define void @missing_load_elimination(i1 %c, <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %q, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: @missing_load_elimination(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[V:%.*]], <vscale x 4 x i32>* [[P1]]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[T:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    store <vscale x 4 x i32> [[T]], <vscale x 4 x i32>* [[Q:%.*]]
+; CHECK-NEXT:    ret void
+; CHECK:       if.else:
+; CHECK-NEXT:    ret void
+;
+entry:
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+  %p1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1
+  store <vscale x 4 x i32> %v, <vscale x 4 x i32>* %p1
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+  %t = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; load could be eliminated
+  store <vscale x 4 x i32> %t, <vscale x 4 x i32>* %q
+  ret void
+
+if.else:
+  ret void
+}

More information about the llvm-commits mailing list