[llvm] 68c1eeb - [ArgPromotion] Make implementation offset based

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 9 00:35:09 PST 2022


Author: Nikita Popov
Date: 2022-02-09T09:35:01+01:00
New Revision: 68c1eeb4bad18753dbaa053a6c919c8f1a23fb9c

URL: https://github.com/llvm/llvm-project/commit/68c1eeb4bad18753dbaa053a6c919c8f1a23fb9c
DIFF: https://github.com/llvm/llvm-project/commit/68c1eeb4bad18753dbaa053a6c919c8f1a23fb9c.diff

LOG: [ArgPromotion] Make implementation offset based

This rewrites ArgPromotion to be based on offsets rather than GEP
structure. We inspect all loads at constant offsets and remember
which types are loaded at which offsets. Then we promote based on
those types.

This generalizes ArgPromotion to work with bitcasted loads, and
is compatible with opaque pointers.
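
As a rough sketch of the idea (based on the bitcasts.ll test updated
below; @f and its value names are placeholders, not part of the commit):
all uses of %p are loads at constant byte offsets, so the pass records
{offset 0 -> i32, offset 4 -> i32} and turns each recorded part into a
scalar parameter.

  ; Before promotion: every use of %p is a load at a known constant offset.
  define internal i32 @f(i8* %p) {
    %p.32 = bitcast i8* %p to i32*
    %x = load i32, i32* %p.32              ; offset 0, loaded as i32
    %q = getelementptr i8, i8* %p, i64 4
    %q.32 = bitcast i8* %q to i32*
    %y = load i32, i32* %q.32              ; offset 4, loaded as i32
    %z = add i32 %x, %y
    ret i32 %z
  }
  ; After promotion the callers perform the loads and the callee takes the
  ; loaded values directly, roughly:
  ;   define internal i32 @f(i32 %p.0.val, i32 %p.4.val)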

This patch also fixes incorrect handling of alignment during
argument promotion. Previously, the implementation only checked
that the pointer was dereferenceable, but would happily speculate
loads that require a higher alignment than the callers guarantee.
(I would have fixed this separately beforehand, but that turned out
to be hard to do with the previous implementation approach.)
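
As an illustration of the fixed case (a sketch mirroring the
callee_not_guaranteed_aligned test in align.ll below; the function names
are placeholders): the align-16 load only executes on one branch, and the
caller only guarantees dereferenceability, so hoisting the load into the
caller would over-align it. With this patch the argument is simply not
promoted here.

  define internal i32 @callee(i1 %c, i32* %p) {
    br i1 %c, label %if, label %else
  if:
    ; Requires 16-byte alignment, but only executes when %c is true.
    %x = load i32, i32* %p, align 16
    ret i32 %x
  else:
    ret i32 -1
  }

  define void @caller(i1 %c, i32* dereferenceable(4) %p) {
    ; The pointer is dereferenceable, but not known to be 16-byte aligned.
    call i32 @callee(i1 %c, i32* %p)
    ret void
  }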

Differential Revision: https://reviews.llvm.org/D118685

Added: 
    llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll

Modified: 
    llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
    llvm/test/Transforms/ArgumentPromotion/align.ll
    llvm/test/Transforms/ArgumentPromotion/bitcasts.ll
    llvm/test/Transforms/ArgumentPromotion/fp80.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index d1d880c4d068..c7b4c8a32186 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -91,43 +91,79 @@ using namespace llvm;
 #define DEBUG_TYPE "argpromotion"
 
 STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
-STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
 STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
 STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
 
-/// A vector used to hold the indices of a single GEP instruction
-using IndicesVector = std::vector<uint64_t>;
+struct ArgPart {
+  Type *Ty;
+  Align Alignment;
+  /// A representative guaranteed-executed load instruction for use by
+  /// metadata transfer.
+  LoadInst *MustExecLoad;
+};
+using OffsetAndArgPart = std::pair<int64_t, ArgPart>;
+
+static Value *createByteGEP(IRBuilderBase &IRB, const DataLayout &DL,
+                            Value *Ptr, Type *ResElemTy, int64_t Offset) {
+  // For non-opaque pointers, try create a "nice" GEP if possible, otherwise
+  // fall back to an i8 GEP to a specific offset.
+  unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
+  APInt OrigOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset);
+  if (!Ptr->getType()->isOpaquePointerTy()) {
+    Type *OrigElemTy = Ptr->getType()->getNonOpaquePointerElementType();
+    if (OrigOffset == 0 && OrigElemTy == ResElemTy)
+      return Ptr;
+
+    if (OrigElemTy->isSized()) {
+      APInt TmpOffset = OrigOffset;
+      Type *TmpTy = OrigElemTy;
+      SmallVector<APInt> IntIndices =
+          DL.getGEPIndicesForOffset(TmpTy, TmpOffset);
+      if (TmpOffset == 0) {
+        // Try to add trailing zero indices to reach the right type.
+        while (TmpTy != ResElemTy) {
+          Type *NextTy = GetElementPtrInst::getTypeAtIndex(TmpTy, (uint64_t)0);
+          if (!NextTy)
+            break;
+
+          IntIndices.push_back(APInt::getZero(
+              isa<StructType>(TmpTy) ? 32 : OrigOffset.getBitWidth()));
+          TmpTy = NextTy;
+        }
+
+        SmallVector<Value *> Indices;
+        for (const APInt &Index : IntIndices)
+          Indices.push_back(IRB.getInt(Index));
+
+        if (OrigOffset != 0 || TmpTy == ResElemTy) {
+          Ptr = IRB.CreateGEP(OrigElemTy, Ptr, Indices);
+          return IRB.CreateBitCast(Ptr, ResElemTy->getPointerTo(AddrSpace));
+        }
+      }
+    }
+  }
+
+  if (OrigOffset != 0) {
+    Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(AddrSpace));
+    Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(OrigOffset));
+  }
+  return IRB.CreateBitCast(Ptr, ResElemTy->getPointerTo(AddrSpace));
+}
 
 /// DoPromotion - This method actually performs the promotion of the specified
 /// arguments, and returns the new function.  At this point, we know that it's
 /// safe to do so.
-static Function *
-doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
-            SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
-            Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
-                ReplaceCallSite) {
+static Function *doPromotion(
+    Function *F,
+    const DenseMap<Argument *, SmallVector<OffsetAndArgPart, 4>> &ArgsToPromote,
+    SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
+    Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
+        ReplaceCallSite) {
   // Start by computing a new prototype for the function, which is the same as
   // the old function, but has modified arguments.
   FunctionType *FTy = F->getFunctionType();
   std::vector<Type *> Params;
 
-  using ScalarizeTable = std::set<std::pair<Type *, IndicesVector>>;
-
-  // ScalarizedElements - If we are promoting a pointer that has elements
-  // accessed out of it, keep track of which elements are accessed so that we
-  // can add one argument for each.
-  //
-  // Arguments that are directly loaded will have a zero element value here, to
-  // handle cases where there are both a direct load and GEP accesses.
-  std::map<Argument *, ScalarizeTable> ScalarizedElements;
-
-  // OriginalLoads - Keep track of a representative load instruction from the
-  // original function so that we can tell the alias analysis implementation
-  // what the new GEP/Load instructions we are inserting look like.
-  // We need to keep the original loads for each argument and the elements
-  // of the argument that are accessed.
-  std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;
-
   // Attribute - Keep track of the parameter attributes for the arguments
   // that we are *not* promoting. For the ones that we do promote, the parameter
   // attributes are lost
@@ -154,58 +190,12 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
       // Dead argument (which are always marked as promotable)
       ++NumArgumentsDead;
     } else {
-      // Okay, this is being promoted. This means that the only uses are loads
-      // or GEPs which are only used by loads
-
-      // In this table, we will track which indices are loaded from the argument
-      // (where direct loads are tracked as no indices).
-      ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
-      for (User *U : make_early_inc_range(I->users())) {
-        Instruction *UI = cast<Instruction>(U);
-        Type *SrcTy;
-        if (LoadInst *L = dyn_cast<LoadInst>(UI))
-          SrcTy = L->getType();
-        else
-          SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
-        // Skip dead GEPs and remove them.
-        if (isa<GetElementPtrInst>(UI) && UI->use_empty()) {
-          UI->eraseFromParent();
-          continue;
-        }
-
-        IndicesVector Indices;
-        Indices.reserve(UI->getNumOperands() - 1);
-        // Since loads will only have a single operand, and GEPs only a single
-        // non-index operand, this will record direct loads without any indices,
-        // and gep+loads with the GEP indices.
-        for (const Use &I : llvm::drop_begin(UI->operands()))
-          Indices.push_back(cast<ConstantInt>(I)->getSExtValue());
-        // GEPs with a single 0 index can be merged with direct loads
-        if (Indices.size() == 1 && Indices.front() == 0)
-          Indices.clear();
-        ArgIndices.insert(std::make_pair(SrcTy, Indices));
-        LoadInst *OrigLoad;
-        if (LoadInst *L = dyn_cast<LoadInst>(UI))
-          OrigLoad = L;
-        else
-          // Take any load, we will use it only to update Alias Analysis
-          OrigLoad = cast<LoadInst>(UI->user_back());
-        OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
-      }
-
-      // Add a parameter to the function for each element passed in.
-      for (const auto &ArgIndex : ArgIndices) {
-        // not allowed to dereference ->begin() if size() is 0
-        Params.push_back(GetElementPtrInst::getIndexedType(
-            I->getType()->getPointerElementType(), ArgIndex.second));
+      const auto &ArgParts = ArgsToPromote.find(&*I)->second;
+      for (const auto &Pair : ArgParts) {
+        Params.push_back(Pair.second.Ty);
         ArgAttrVec.push_back(AttributeSet());
-        assert(Params.back());
       }
-
-      if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
-        ++NumArgumentsPromoted;
-      else
-        ++NumAggregatesPromoted;
+      ++NumArgumentsPromoted;
     }
   }
 
@@ -277,44 +267,18 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
           ArgAttrVec.push_back(AttributeSet());
         }
       } else if (!I->use_empty()) {
-        // Non-dead argument: insert GEPs and loads as appropriate.
-        ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
-        // Store the Value* version of the indices in here, but declare it now
-        // for reuse.
-        std::vector<Value *> Ops;
-        for (const auto &ArgIndex : ArgIndices) {
-          Value *V = *AI;
-          LoadInst *OrigLoad =
-              OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
-          if (!ArgIndex.second.empty()) {
-            Ops.reserve(ArgIndex.second.size());
-            Type *ElTy = V->getType();
-            for (auto II : ArgIndex.second) {
-              // Use i32 to index structs, and i64 for others (pointers/arrays).
-              // This satisfies GEP constraints.
-              Type *IdxTy =
-                  (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
-                                      : Type::getInt64Ty(F->getContext()));
-              Ops.push_back(ConstantInt::get(IdxTy, II));
-              // Keep track of the type we're currently indexing.
-              if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
-                ElTy = ElPTy->getPointerElementType();
-              else
-                ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II);
-            }
-            // And create a GEP to extract those indices.
-            V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx");
-            Ops.clear();
+        Value *V = *AI;
+        const auto &ArgParts = ArgsToPromote.find(&*I)->second;
+        for (const auto &Pair : ArgParts) {
+          LoadInst *LI = IRB.CreateAlignedLoad(
+              Pair.second.Ty,
+              createByteGEP(IRB, DL, V, Pair.second.Ty, Pair.first),
+              Pair.second.Alignment, V->getName() + ".val");
+          if (Pair.second.MustExecLoad) {
+            // TODO: Transfer other metadata like !nonnull here.
+            LI->setAAMetadata(Pair.second.MustExecLoad->getAAMetadata());
           }
-          // Since we're replacing a load make sure we take the alignment
-          // of the previous load.
-          LoadInst *newLoad =
-              IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val");
-          newLoad->setAlignment(OrigLoad->getAlign());
-          // Transfer the AA info too.
-          newLoad->setAAMetadata(OrigLoad->getAAMetadata());
-
-          Args.push_back(newLoad);
+          Args.push_back(LI);
           ArgAttrVec.push_back(AttributeSet());
         }
       }
@@ -416,57 +380,46 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
     if (Arg.use_empty())
       continue;
 
+    SmallDenseMap<int64_t, Argument *> OffsetToArg;
+    for (const auto &Pair : ArgsToPromote.find(&Arg)->second) {
+      Argument &NewArg = *I2++;
+      NewArg.setName(Arg.getName() + "." + Twine(Pair.first) + ".val");
+      OffsetToArg.insert({Pair.first, &NewArg});
+    }
+
     // Otherwise, if we promoted this argument, then all users are load
-    // instructions (or GEPs with only load users), and all loads should be
-    // using the new argument that we added.
-    ScalarizeTable &ArgIndices = ScalarizedElements[&Arg];
-
-    while (!Arg.use_empty()) {
-      if (LoadInst *LI = dyn_cast<LoadInst>(Arg.user_back())) {
-        assert(ArgIndices.begin()->second.empty() &&
-               "Load element should sort to front!");
-        I2->setName(Arg.getName() + ".val");
-        LI->replaceAllUsesWith(&*I2);
-        LI->eraseFromParent();
-        LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << Arg.getName()
-                          << "' in function '" << F->getName() << "'\n");
-      } else {
-        GetElementPtrInst *GEP = cast<GetElementPtrInst>(Arg.user_back());
-        assert(!GEP->use_empty() &&
-               "GEPs without uses should be cleaned up already");
-        IndicesVector Operands;
-        Operands.reserve(GEP->getNumIndices());
-        for (const Use &Idx : GEP->indices())
-          Operands.push_back(cast<ConstantInt>(Idx)->getSExtValue());
-
-        // GEPs with a single 0 index can be merged with direct loads
-        if (Operands.size() == 1 && Operands.front() == 0)
-          Operands.clear();
-
-        Function::arg_iterator TheArg = I2;
-        for (ScalarizeTable::iterator It = ArgIndices.begin();
-             It->second != Operands; ++It, ++TheArg) {
-          assert(It != ArgIndices.end() && "GEP not handled??");
-        }
+    // instructions (with possible casts and GEPs in between).
+
+    SmallVector<Value *, 16> Worklist;
+    SmallVector<Instruction *, 16> DeadInsts;
+    append_range(Worklist, Arg.users());
+    while (!Worklist.empty()) {
+      Value *V = Worklist.pop_back_val();
+      if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V)) {
+        DeadInsts.push_back(cast<Instruction>(V));
+        append_range(Worklist, V->users());
+        continue;
+      }
 
-        TheArg->setName(formatv("{0}.{1:$[.]}.val", Arg.getName(),
-                                make_range(Operands.begin(), Operands.end())));
+      if (auto *LI = dyn_cast<LoadInst>(V)) {
+        Value *Ptr = LI->getPointerOperand();
+        APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+        Ptr =
+            Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
+                                                   /* AllowNonInbounds */ true);
+        assert(Ptr == &Arg && "Not constant offset from arg?");
+        LI->replaceAllUsesWith(OffsetToArg[Offset.getSExtValue()]);
+        DeadInsts.push_back(LI);
+        continue;
+      }
 
-        LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
-                          << "' of function '" << NF->getName() << "'\n");
+      llvm_unreachable("Unexpected user");
+    }
 
-        // All of the uses must be load instructions.  Replace them all with
-        // the argument specified by ArgNo.
-        while (!GEP->use_empty()) {
-          LoadInst *L = cast<LoadInst>(GEP->user_back());
-          L->replaceAllUsesWith(&*TheArg);
-          L->eraseFromParent();
-        }
-        GEP->eraseFromParent();
-      }
+    for (Instruction *I : DeadInsts) {
+      I->replaceAllUsesWith(UndefValue::get(I->getType()));
+      I->eraseFromParent();
     }
-    // Increment I2 past all of the arguments added for this promoted pointer.
-    std::advance(I2, ArgIndices.size());
   }
 
   return NF;
@@ -474,11 +427,12 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
 
 /// Return true if we can prove that all callees pass in a valid pointer for the
 /// specified function argument.
-static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) {
+static bool allCallersPassValidPointerForArgument(Argument *Arg,
+                                                  Align NeededAlign,
+                                                  uint64_t NeededDerefBytes) {
   Function *Callee = Arg->getParent();
   const DataLayout &DL = Callee->getParent()->getDataLayout();
-  Align NeededAlign(1); // TODO: This is incorrect!
-  APInt Bytes(64, DL.getTypeStoreSize(Ty));
+  APInt Bytes(64, NeededDerefBytes);
 
   // Check if the argument itself is marked dereferenceable and aligned.
   if (isDereferenceableAndAlignedPointer(Arg, NeededAlign, Bytes, DL))
@@ -493,83 +447,17 @@ static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) {
   });
 }
 
-/// Returns true if Prefix is a prefix of longer. That means, Longer has a size
-/// that is greater than or equal to the size of prefix, and each of the
-/// elements in Prefix is the same as the corresponding elements in Longer.
-///
-/// This means it also returns true when Prefix and Longer are equal!
-static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
-  if (Prefix.size() > Longer.size())
-    return false;
-  return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
-}
-
-/// Checks if Indices, or a prefix of Indices, is in Set.
-static bool prefixIn(const IndicesVector &Indices,
-                     std::set<IndicesVector> &Set) {
-  std::set<IndicesVector>::iterator Low;
-  Low = Set.upper_bound(Indices);
-  if (Low != Set.begin())
-    Low--;
-  // Low is now the last element smaller than or equal to Indices. This means
-  // it points to a prefix of Indices (possibly Indices itself), if such
-  // prefix exists.
-  //
-  // This load is safe if any prefix of its operands is safe to load.
-  return Low != Set.end() && isPrefix(*Low, Indices);
-}
-
-/// Mark the given indices (ToMark) as safe in the given set of indices
-/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
-/// is already a prefix of Indices in Safe, Indices are implicitely marked safe
-/// already. Furthermore, any indices that Indices is itself a prefix of, are
-/// removed from Safe (since they are implicitely safe because of Indices now).
-static void markIndicesSafe(const IndicesVector &ToMark,
-                            std::set<IndicesVector> &Safe) {
-  std::set<IndicesVector>::iterator Low;
-  Low = Safe.upper_bound(ToMark);
-  // Guard against the case where Safe is empty
-  if (Low != Safe.begin())
-    Low--;
-  // Low is now the last element smaller than or equal to Indices. This
-  // means it points to a prefix of Indices (possibly Indices itself), if
-  // such prefix exists.
-  if (Low != Safe.end()) {
-    if (isPrefix(*Low, ToMark))
-      // If there is already a prefix of these indices (or exactly these
-      // indices) marked a safe, don't bother adding these indices
-      return;
-
-    // Increment Low, so we can use it as a "insert before" hint
-    ++Low;
-  }
-  // Insert
-  Low = Safe.insert(Low, ToMark);
-  ++Low;
-  // If there we're a prefix of longer index list(s), remove those
-  std::set<IndicesVector>::iterator End = Safe.end();
-  while (Low != End && isPrefix(ToMark, *Low)) {
-    std::set<IndicesVector>::iterator Remove = Low;
-    ++Low;
-    Safe.erase(Remove);
-  }
-}
-
-/// isSafeToPromoteArgument - As you might guess from the name of this method,
-/// it checks to see if it is both safe and useful to promote the argument.
-/// This method limits promotion of aggregates to only promote up to three
-/// elements of the aggregate in order to avoid exploding the number of
-/// arguments passed in.
-static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR,
-                                    unsigned MaxElements) {
-  using GEPIndicesSet = std::set<IndicesVector>;
-
+/// Determine that this argument is safe to promote, and find the argument
+/// parts it can be promoted into.
+static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
+                         unsigned MaxElements, bool IsSelfRecursive,
+                         SmallVectorImpl<OffsetAndArgPart> &ArgPartsVec) {
   // Quick exit for unused arguments
   if (Arg->use_empty())
     return true;
 
-  // We can only promote this argument if all of the uses are loads, or are GEP
-  // instructions (with constant indices) that are subsequently loaded.
+  // We can only promote this argument if all of the uses are loads at known
+  // offsets.
   //
   // Promoting the argument causes it to be loaded in the caller
   // unconditionally. This is only safe if we can prove that either the load
@@ -580,156 +468,164 @@ static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR
   // anyway, in the latter case, invalid loads won't happen. This prevents us
   // from introducing an invalid load that wouldn't have happened in the
   // original code.
-  //
-  // This set will contain all sets of indices that are loaded in the entry
-  // block, and thus are safe to unconditionally load in the caller.
-  GEPIndicesSet SafeToUnconditionallyLoad;
-
-  // This set contains all the sets of indices that we are planning to promote.
-  // This makes it possible to limit the number of arguments added.
-  GEPIndicesSet ToPromote;
-
-  // If the pointer is always valid, any load with first index 0 is valid.
-
-  if (ByValTy)
-    SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
-
-  // Whenever a new underlying type for the operand is found, make sure it's
-  // consistent with the GEPs and loads we've already seen and, if necessary,
-  // use it to see if all incoming pointers are valid (which implies the 0-index
-  // is safe).
-  Type *BaseTy = ByValTy;
-  auto UpdateBaseTy = [&](Type *NewBaseTy) {
-    if (BaseTy)
-      return BaseTy == NewBaseTy;
-
-    BaseTy = NewBaseTy;
-    if (allCallersPassValidPointerForArgument(Arg, BaseTy)) {
-      assert(SafeToUnconditionallyLoad.empty());
-      SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+
+  SmallDenseMap<int64_t, ArgPart, 4> ArgParts;
+  Align NeededAlign(1);
+  uint64_t NeededDerefBytes = 0;
+
+  // Returns None if this load is not based on the argument. Return true if
+  // we can promote the load, false otherwise.
+  auto HandleLoad = [&](LoadInst *LI,
+                        bool GuaranteedToExecute) -> Optional<bool> {
+    // Don't promote volatile or atomic loads.
+    if (!LI->isSimple())
+      return false;
+
+    Value *Ptr = LI->getPointerOperand();
+    APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+    Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
+                                                 /* AllowNonInbounds */ true);
+    if (Ptr != Arg)
+      return None;
+
+    if (Offset.getSignificantBits() >= 64)
+      return false;
+
+    Type *Ty = LI->getType();
+    TypeSize Size = DL.getTypeStoreSize(Ty);
+    // Don't try to promote scalable types.
+    if (Size.isScalable())
+      return false;
+
+    // If this is a self-recursive function and one of the types is a pointer,
+    // then promoting it might lead to recursive promotion.
+    if (IsSelfRecursive && Ty->isPointerTy())
+      return false;
+
+    int64_t Off = Offset.getSExtValue();
+    auto Pair = ArgParts.try_emplace(
+        Off, ArgPart{Ty, LI->getAlign(), GuaranteedToExecute ? LI : nullptr});
+    ArgPart &Part = Pair.first->second;
+    bool OffsetNotSeenBefore = Pair.second;
+
+    // We limit promotion to only promoting up to a fixed number of elements of
+    // the aggregate.
+    if (MaxElements > 0 && ArgParts.size() >= MaxElements) {
+      LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: "
+                        << "more than " << MaxElements << " parts\n");
+      return false;
     }
 
-    return true;
-  };
+    // For now, we only support loading one specific type at a given offset.
+    if (Part.Ty != Ty) {
+      LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: "
+                        << "loaded via both " << *Part.Ty << " and " << *Ty
+                        << " at offset " << Off << "\n");
+      return false;
+    }
 
-  // First, iterate functions that are guaranteed to execution on function
-  // entry and mark loads of (geps of) arguments as safe.
-  BasicBlock &EntryBlock = Arg->getParent()->front();
-  // Declare this here so we can reuse it
-  IndicesVector Indices;
-  for (Instruction &I : EntryBlock) {
-    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
-      Value *V = LI->getPointerOperand();
-      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
-        V = GEP->getPointerOperand();
-        if (V == Arg) {
-          // This load actually loads (part of) Arg? Check the indices then.
-          Indices.reserve(GEP->getNumIndices());
-          for (Use &Idx : GEP->indices())
-            if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
-              Indices.push_back(CI->getSExtValue());
-            else
-              // We found a non-constant GEP index for this argument? Bail out
-              // right away, can't promote this argument at all.
-              return false;
-
-          if (!UpdateBaseTy(GEP->getSourceElementType()))
-            return false;
-
-          // Indices checked out, mark them as safe
-          markIndicesSafe(Indices, SafeToUnconditionallyLoad);
-          Indices.clear();
-        }
-      } else if (V == Arg) {
-        // Direct loads are equivalent to a GEP with a single 0 index.
-        markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+    // If this load is not guaranteed to execute and we haven't seen a load at
+    // this offset before (or it had lower alignment), then we need to remember
+    // that requirement.
+    // Note that skipping loads of previously seen offsets is only correct
+    // because we only allow a single type for a given offset, which also means
+    // that the number of accessed bytes will be the same.
+    if (!GuaranteedToExecute &&
+        (OffsetNotSeenBefore || Part.Alignment < LI->getAlign())) {
+      // We won't be able to prove dereferenceability for negative offsets.
+      if (Off < 0)
+        return false;
 
-        if (BaseTy && LI->getType() != BaseTy)
-          return false;
+      // If the offset is not aligned, an aligned base pointer won't help.
+      if (!isAligned(LI->getAlign(), Off))
+        return false;
 
-        BaseTy = LI->getType();
-      }
+      NeededDerefBytes = std::max(NeededDerefBytes, Off + Size.getFixedValue());
+      NeededAlign = std::max(NeededAlign, LI->getAlign());
     }
 
+    Part.Alignment = std::max(Part.Alignment, LI->getAlign());
+    return true;
+  };
+
+  // Look for loads that are guaranteed to execute on entry.
+  for (Instruction &I : Arg->getParent()->getEntryBlock()) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(&I))
+      if (Optional<bool> Res = HandleLoad(LI, /* GuaranteedToExecute */ true))
+        if (!*Res)
+          return false;
+
     if (!isGuaranteedToTransferExecutionToSuccessor(&I))
       break;
   }
 
-  // Now, iterate all uses of the argument to see if there are any uses that are
-  // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
+  // Now look at all loads of the argument. Remember the load instructions
+  // for the aliasing check below.
+  SmallVector<Value *, 16> Worklist;
+  SmallPtrSet<Value *, 16> Visited;
   SmallVector<LoadInst *, 16> Loads;
-  IndicesVector Operands;
-  for (Use &U : Arg->uses()) {
-    User *UR = U.getUser();
-    Operands.clear();
-    if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
-      // Don't hack volatile/atomic loads
-      if (!LI->isSimple())
-        return false;
-      Loads.push_back(LI);
-      // Direct loads are equivalent to a GEP with a zero index and then a load.
-      Operands.push_back(0);
+  auto AppendUsers = [&](Value *V) {
+    for (User *U : V->users())
+      if (Visited.insert(U).second)
+        Worklist.push_back(U);
+  };
+  AppendUsers(Arg);
+  while (!Worklist.empty()) {
+    Value *V = Worklist.pop_back_val();
+    if (isa<BitCastInst>(V)) {
+      AppendUsers(V);
+      continue;
+    }
 
-      if (!UpdateBaseTy(LI->getType()))
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
+      if (!GEP->hasAllConstantIndices())
         return false;
-    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) {
-      if (GEP->use_empty()) {
-        // Dead GEP's cause trouble later.  Just remove them if we run into
-        // them.
-        continue;
-      }
+      AppendUsers(V);
+      continue;
+    }
 
-      if (!UpdateBaseTy(GEP->getSourceElementType()))
+    if (auto *LI = dyn_cast<LoadInst>(V)) {
+      if (!*HandleLoad(LI, /* GuaranteedToExecute */ false))
         return false;
-
-      // Ensure that all of the indices are constants.
-      for (Use &Idx : GEP->indices())
-        if (ConstantInt *C = dyn_cast<ConstantInt>(Idx))
-          Operands.push_back(C->getSExtValue());
-        else
-          return false; // Not a constant operand GEP!
-
-      // Ensure that the only users of the GEP are load instructions.
-      for (User *GEPU : GEP->users())
-        if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
-          // Don't hack volatile/atomic loads
-          if (!LI->isSimple())
-            return false;
-          Loads.push_back(LI);
-        } else {
-          // Other uses than load?
-          return false;
-        }
-    } else {
-      return false; // Not a load or a GEP.
+      Loads.push_back(LI);
+      continue;
     }
 
-    // Now, see if it is safe to promote this load / loads of this GEP. Loading
-    // is safe if Operands, or a prefix of Operands, is marked as safe.
-    if (!prefixIn(Operands, SafeToUnconditionallyLoad))
-      return false;
+    // Unknown user.
+    LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: "
+                      << "unknown user " << *V << "\n");
+    return false;
+  }
 
-    // See if we are already promoting a load with these indices. If not, check
-    // to make sure that we aren't promoting too many elements.  If so, nothing
-    // to do.
-    if (ToPromote.find(Operands) == ToPromote.end()) {
-      if (MaxElements > 0 && ToPromote.size() == MaxElements) {
-        LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '"
-                          << Arg->getName()
-                          << "' because it would require adding more "
-                          << "than " << MaxElements
-                          << " arguments to the function.\n");
-        // We limit aggregate promotion to only promoting up to a fixed number
-        // of elements of the aggregate.
-        return false;
-      }
-      ToPromote.insert(std::move(Operands));
+  if (NeededDerefBytes || NeededAlign > 1) {
+    // Try to prove a required deref / aligned requirement.
+    if (!allCallersPassValidPointerForArgument(Arg, NeededAlign,
+                                               NeededDerefBytes)) {
+      LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: "
+                        << "not dereferenceable or aligned\n");
+      return false;
     }
   }
 
-  if (Loads.empty())
+  if (ArgParts.empty())
     return true; // No users, this is a dead argument.
 
+  // Sort parts by offset.
+  append_range(ArgPartsVec, ArgParts);
+  sort(ArgPartsVec,
+       [](const auto &A, const auto &B) { return A.first < B.first; });
+
+  // Make sure the parts are non-overlapping.
+  // TODO: As we're doing pure load promotion here, overlap should be fine from
+  // a correctness perspective. Profitability is less obvious though.
+  int64_t Offset = ArgPartsVec[0].first;
+  for (const auto &Pair : ArgPartsVec) {
+    if (Pair.first < Offset)
+      return false; // Overlap with previous part.
+
+    Offset = Pair.first + DL.getTypeStoreSize(Pair.second.Ty);
+  }
+
   // Okay, now we know that the argument is only used by load instructions and
   // it is safe to unconditionally perform all of them. Use alias analysis to
   // check to see if the pointer is guaranteed to not be modified from entry of
@@ -836,30 +732,19 @@ static bool canPaddingBeAccessed(Argument *arg) {
   return false;
 }
 
-/// Check if callers and the callee \p F agree how promoted arguments would be
-/// passed. The ones that they do not agree on are eliminated from the sets but
-/// the return value has to be observed as well.
-static bool areFunctionArgsABICompatible(
-    const Function &F, const TargetTransformInfo &TTI,
-    SmallPtrSetImpl<Argument *> &ArgsToPromote,
-    SmallPtrSetImpl<Argument *> &ByValArgsToTransform) {
-  // TODO: Check individual arguments so we can promote a subset?
-  SmallVector<Type *, 32> Types;
-  for (Argument *Arg : ArgsToPromote)
-    Types.push_back(Arg->getType()->getPointerElementType());
-  for (Argument *Arg : ByValArgsToTransform)
-    Types.push_back(Arg->getParamByValType());
-
-  for (const Use &U : F.uses()) {
+/// Check if callers and callee agree on how promoted arguments would be
+/// passed.
+static bool areTypesABICompatible(ArrayRef<Type *> Types, const Function &F,
+                                  const TargetTransformInfo &TTI) {
+  return all_of(F.uses(), [&](const Use &U) {
     CallBase *CB = dyn_cast<CallBase>(U.getUser());
     if (!CB)
       return false;
+
     const Function *Caller = CB->getCaller();
     const Function *Callee = CB->getCalledFunction();
-    if (!TTI.areTypesABICompatible(Caller, Callee, Types))
-      return false;
-  }
-  return true;
+    return TTI.areTypesABICompatible(Caller, Callee, Types);
+  });
 }
 
 /// PromoteArguments - This method checks the specified function to see if there
@@ -933,11 +818,9 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
 
   // Check to see which arguments are promotable.  If an argument is promotable,
   // add it to ArgsToPromote.
-  SmallPtrSet<Argument *, 8> ArgsToPromote;
+  DenseMap<Argument *, SmallVector<OffsetAndArgPart, 4>> ArgsToPromote;
   SmallPtrSet<Argument *, 8> ByValArgsToTransform;
   for (Argument *PtrArg : PointerArgs) {
-    Type *AgTy = PtrArg->getType()->getPointerElementType();
-
     // Replace sret attribute with noalias. This reduces register pressure by
     // avoiding a register copy.
     if (PtrArg->hasStructRetAttr()) {
@@ -957,11 +840,13 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
     //
     // Only handle arguments with specified alignment; if it's unspecified, the
     // actual alignment of the argument is target-specific.
-    bool isSafeToPromote = PtrArg->hasByValAttr() && PtrArg->getParamAlign() &&
-                           (ArgumentPromotionPass::isDenselyPacked(AgTy, DL) ||
-                            !canPaddingBeAccessed(PtrArg));
+    Type *ByValTy = PtrArg->getParamByValType();
+    bool isSafeToPromote =
+        ByValTy && PtrArg->getParamAlign() &&
+        (ArgumentPromotionPass::isDenselyPacked(ByValTy, DL) ||
+         !canPaddingBeAccessed(PtrArg));
     if (isSafeToPromote) {
-      if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+      if (StructType *STy = dyn_cast<StructType>(ByValTy)) {
         if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
           LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '"
                             << PtrArg->getName()
@@ -971,51 +856,39 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
           continue;
         }
 
+        SmallVector<Type *, 4> Types;
+        append_range(Types, STy->elements());
+
         // If all the elements are single-value types, we can promote it.
-        bool AllSimple = true;
-        for (const auto *EltTy : STy->elements()) {
-          if (!EltTy->isSingleValueType()) {
-            AllSimple = false;
-            break;
-          }
-        }
+        bool AllSimple =
+            all_of(Types, [](Type *Ty) { return Ty->isSingleValueType(); });
 
         // Safe to transform, don't even bother trying to "promote" it.
         // Passing the elements as a scalar will allow sroa to hack on
         // the new alloca we introduce.
-        if (AllSimple) {
+        if (AllSimple && areTypesABICompatible(Types, *F, TTI)) {
           ByValArgsToTransform.insert(PtrArg);
           continue;
         }
       }
     }
 
-    // If the argument is a recursive type and we're in a recursive
-    // function, we could end up infinitely peeling the function argument.
-    if (isSelfRecursive) {
-      if (StructType *STy = dyn_cast<StructType>(AgTy)) {
-        bool RecursiveType =
-            llvm::is_contained(STy->elements(), PtrArg->getType());
-        if (RecursiveType)
-          continue;
-      }
-    }
-
     // Otherwise, see if we can promote the pointer to its value.
-    Type *ByValTy =
-        PtrArg->hasByValAttr() ? PtrArg->getParamByValType() : nullptr;
-    if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements))
-      ArgsToPromote.insert(PtrArg);
+    SmallVector<OffsetAndArgPart, 4> ArgParts;
+    if (findArgParts(PtrArg, DL, AAR, MaxElements, isSelfRecursive, ArgParts)) {
+      SmallVector<Type *, 4> Types;
+      for (const auto &Pair : ArgParts)
+        Types.push_back(Pair.second.Ty);
+
+      if (areTypesABICompatible(Types, *F, TTI))
+        ArgsToPromote.insert({PtrArg, std::move(ArgParts)});
+    }
   }
 
   // No promotable pointer arguments.
   if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
     return nullptr;
 
-  if (!areFunctionArgsABICompatible(
-          *F, TTI, ArgsToPromote, ByValArgsToTransform))
-    return nullptr;
-
   return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
 }
 

diff --git a/llvm/test/Transforms/ArgumentPromotion/align.ll b/llvm/test/Transforms/ArgumentPromotion/align.ll
index 7e30e8b50a48..5591ff09c78e 100644
--- a/llvm/test/Transforms/ArgumentPromotion/align.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/align.ll
@@ -3,8 +3,8 @@
 
 define internal i32 @callee_must_exec(i32* %p) {
 ; CHECK-LABEL: define {{[^@]+}}@callee_must_exec
-; CHECK-SAME: (i32 [[P_VAL:%.*]]) {
-; CHECK-NEXT:    ret i32 [[P_VAL]]
+; CHECK-SAME: (i32 [[P_0_VAL:%.*]]) {
+; CHECK-NEXT:    ret i32 [[P_0_VAL]]
 ;
   %x = load i32, i32* %p, align 16
   ret i32 %x
@@ -23,10 +23,10 @@ define void @caller_must_exec(i32* %p) {
 
 define internal i32 @callee_guaranteed_aligned_1(i1 %c, i32* %p) {
 ; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_1
-; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_VAL:%.*]]) {
+; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) {
 ; CHECK-NEXT:    br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    ret i32 [[P_VAL]]
+; CHECK-NEXT:    ret i32 [[P_0_VAL]]
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i32 -1
 ;
@@ -53,10 +53,10 @@ define void @caller_guaranteed_aligned_1(i1 %c, i32* align 16 dereferenceable(4)
 
 define internal i32 @callee_guaranteed_aligned_2(i1 %c, i32* align 16 dereferenceable(4) %p) {
 ; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_2
-; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_VAL:%.*]]) {
+; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) {
 ; CHECK-NEXT:    br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    ret i32 [[P_VAL]]
+; CHECK-NEXT:    ret i32 [[P_0_VAL]]
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i32 -1
 ;
@@ -81,14 +81,15 @@ define void @caller_guaranteed_aligned_2(i1 %c, i32* %p) {
   ret void
 }
 
-; TODO: This should not be promoted, as the caller only guarantees that the
+; This should not be promoted, as the caller only guarantees that the
 ; pointer is dereferenceable, not that it is aligned.
 define internal i32 @callee_not_guaranteed_aligned(i1 %c, i32* %p) {
 ; CHECK-LABEL: define {{[^@]+}}@callee_not_guaranteed_aligned
-; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_VAL:%.*]]) {
+; CHECK-SAME: (i1 [[C:%.*]], i32* [[P:%.*]]) {
 ; CHECK-NEXT:    br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    ret i32 [[P_VAL]]
+; CHECK-NEXT:    [[X:%.*]] = load i32, i32* [[P]], align 16
+; CHECK-NEXT:    ret i32 [[X]]
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i32 -1
 ;
@@ -105,8 +106,7 @@ else:
 define void @caller_not_guaranteed_aligned(i1 %c, i32* dereferenceable(4) %p) {
 ; CHECK-LABEL: define {{[^@]+}}@caller_not_guaranteed_aligned
 ; CHECK-SAME: (i1 [[C:%.*]], i32* dereferenceable(4) [[P:%.*]]) {
-; CHECK-NEXT:    [[P_VAL:%.*]] = load i32, i32* [[P]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @callee_not_guaranteed_aligned(i1 [[C]], i32 [[P_VAL]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @callee_not_guaranteed_aligned(i1 [[C]], i32* [[P]])
 ; CHECK-NEXT:    ret void
 ;
   call i32 @callee_not_guaranteed_aligned(i1 %c, i32* %p)

diff --git a/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll b/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll
index 68a3223cf997..5bc99dbed975 100644
--- a/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll
@@ -7,13 +7,8 @@
 
 define internal i32 @callee_basic(i8* %p) {
 ; CHECK-LABEL: define {{[^@]+}}@callee_basic
-; CHECK-SAME: (i8* [[P:%.*]]) {
-; CHECK-NEXT:    [[P_32:%.*]] = bitcast i8* [[P]] to i32*
-; CHECK-NEXT:    [[X:%.*]] = load i32, i32* [[P_32]], align 4
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, i8* [[P]], i64 4
-; CHECK-NEXT:    [[P1_32:%.*]] = bitcast i8* [[P1]] to i32*
-; CHECK-NEXT:    [[Y:%.*]] = load i32, i32* [[P1_32]], align 4
-; CHECK-NEXT:    [[Z:%.*]] = add i32 [[X]], [[Y]]
+; CHECK-SAME: (i32 [[P_0_VAL:%.*]], i32 [[P_4_VAL:%.*]]) {
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]]
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %p.32 = bitcast i8* %p to i32*
@@ -28,7 +23,12 @@ define internal i32 @callee_basic(i8* %p) {
 define void @caller_basic(i8* %p) {
 ; CHECK-LABEL: define {{[^@]+}}@caller_basic
 ; CHECK-SAME: (i8* [[P:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @callee_basic(i8* [[P]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P]] to i32*
+; CHECK-NEXT:    [[P_VAL:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[P]], i64 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to i32*
+; CHECK-NEXT:    [[P_VAL1:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @callee_basic(i32 [[P_VAL]], i32 [[P_VAL1]])
 ; CHECK-NEXT:    ret void
 ;
   call i32 @callee_basic(i8* %p)
@@ -37,12 +37,8 @@ define void @caller_basic(i8* %p) {
 
 define internal i32 @callee_opaque(%opaque* %p) {
 ; CHECK-LABEL: define {{[^@]+}}@callee_opaque
-; CHECK-SAME: (%opaque* [[P:%.*]]) {
-; CHECK-NEXT:    [[P_32:%.*]] = bitcast %opaque* [[P]] to i32*
-; CHECK-NEXT:    [[X:%.*]] = load i32, i32* [[P_32]], align 4
-; CHECK-NEXT:    [[P1_32:%.*]] = getelementptr i32, i32* [[P_32]], i64 1
-; CHECK-NEXT:    [[Y:%.*]] = load i32, i32* [[P1_32]], align 4
-; CHECK-NEXT:    [[Z:%.*]] = add i32 [[X]], [[Y]]
+; CHECK-SAME: (i32 [[P_0_VAL:%.*]], i32 [[P_4_VAL:%.*]]) {
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]]
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %p.32 = bitcast %opaque* %p to i32*
@@ -56,7 +52,13 @@ define internal i32 @callee_opaque(%opaque* %p) {
 define void @caller_opaque(%opaque* %p) {
 ; CHECK-LABEL: define {{[^@]+}}@caller_opaque
 ; CHECK-SAME: (%opaque* [[P:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @callee_opaque(%opaque* [[P]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %opaque* [[P]] to i32*
+; CHECK-NEXT:    [[P_VAL:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %opaque* [[P]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
+; CHECK-NEXT:    [[P_VAL1:%.*]] = load i32, i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @callee_opaque(i32 [[P_VAL]], i32 [[P_VAL1]])
 ; CHECK-NEXT:    ret void
 ;
   call i32 @callee_opaque(%opaque* %p)

diff --git a/llvm/test/Transforms/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/ArgumentPromotion/fp80.ll
index 31cf60a9439c..d791fd15a559 100644
--- a/llvm/test/Transforms/ArgumentPromotion/fp80.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/fp80.ll
@@ -15,12 +15,17 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @run() {
 ; CHECK-LABEL: define {{[^@]+}}@run() {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @UseLongDoubleUnsafely(%union.u* byval([[UNION_U:%.*]]) align 16 bitcast (%struct.s* @b to %union.u*))
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[UNION_U]], %union.u* bitcast (%struct.s* @b to %union.u*), i32 0, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %union.u* bitcast (%struct.s* @b to %union.u*) to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 10
+; CHECK-NEXT:    [[DOTVAL:%.*]] = load i8, i8* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @UseLongDoubleUnsafely(i8 [[DOTVAL]])
+; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[UNION_U:%.*]], %union.u* bitcast (%struct.s* @b to %union.u*), i32 0, i32 0
 ; CHECK-NEXT:    [[DOT0_VAL:%.*]] = load x86_fp80, x86_fp80* [[DOT0]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call x86_fp80 @UseLongDoubleSafely(x86_fp80 [[DOT0_VAL]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @AccessPaddingOfStruct(%struct.Foo* byval([[STRUCT_FOO:%.*]]) @a)
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @CaptureAStruct(%struct.Foo* byval([[STRUCT_FOO]]) @a)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_fp80 @UseLongDoubleSafely(x86_fp80 [[DOT0_VAL]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.Foo* @a to i64*
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i64, i64* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @AccessPaddingOfStruct(i64 [[A_VAL]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @CaptureAStruct(%struct.Foo* byval([[STRUCT_FOO:%.*]]) @a)
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -33,12 +38,9 @@ entry:
 
 define internal i8 @UseLongDoubleUnsafely(%union.u* byval(%union.u) align 16 %arg) {
 ; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleUnsafely
-; CHECK-SAME: (%union.u* byval([[UNION_U:%.*]]) align 16 [[ARG:%.*]]) {
+; CHECK-SAME: (i8 [[ARG_10_VAL:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast %union.u* [[ARG]] to %struct.s*
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.s* [[BITCAST]], i64 0, i32 2
-; CHECK-NEXT:    [[RESULT:%.*]] = load i8, i8* [[GEP]], align 1
-; CHECK-NEXT:    ret i8 [[RESULT]]
+; CHECK-NEXT:    ret i8 [[ARG_10_VAL]]
 ;
 entry:
   %bitcast = bitcast %union.u* %arg to %struct.s*
@@ -64,10 +66,8 @@ define internal x86_fp80 @UseLongDoubleSafely(%union.u* byval(%union.u) align 16
 
 define internal i64 @AccessPaddingOfStruct(%struct.Foo* byval(%struct.Foo) %a) {
 ; CHECK-LABEL: define {{[^@]+}}@AccessPaddingOfStruct
-; CHECK-SAME: (%struct.Foo* byval([[STRUCT_FOO:%.*]]) [[A:%.*]]) {
-; CHECK-NEXT:    [[P:%.*]] = bitcast %struct.Foo* [[A]] to i64*
-; CHECK-NEXT:    [[V:%.*]] = load i64, i64* [[P]], align 8
-; CHECK-NEXT:    ret i64 [[V]]
+; CHECK-SAME: (i64 [[A_0_VAL:%.*]]) {
+; CHECK-NEXT:    ret i64 [[A_0_VAL]]
 ;
   %p = bitcast %struct.Foo* %a to i64*
   %v = load i64, i64* %p

diff --git a/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll b/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll
new file mode 100644
index 000000000000..25939bc900b8
--- /dev/null
+++ b/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes
+; RUN: opt -S -argpromotion -opaque-pointers < %s | FileCheck %s
+
+define internal i32 @callee_basic(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@callee_basic
+; CHECK-SAME: (i32 [[P_0_VAL:%.*]], i32 [[P_4_VAL:%.*]]) {
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]]
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %x = load i32, ptr %p
+  %p1 = getelementptr i8, ptr %p, i64 4
+  %y = load i32, ptr %p1
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+define void @caller_basic(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@caller_basic
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[P_VAL:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 4
+; CHECK-NEXT:    [[P_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @callee_basic(i32 [[P_VAL]], i32 [[P_VAL1]])
+; CHECK-NEXT:    ret void
+;
+  call i32 @callee_basic(ptr %p)
+  ret void
+}
+
+; Same offset is loaded with two different types: Don't promote.
+define internal i32 @callee_different_types(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@callee_different_types
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[Y_F:%.*]] = load float, ptr [[P]], align 4
+; CHECK-NEXT:    [[Y:%.*]] = fptoui float [[Y_F]] to i32
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %x = load i32, ptr %p
+  %y.f = load float, ptr %p
+  %y = fptoui float %y.f to i32
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+define void @caller_different_types(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@caller_different_types
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @callee_different_types(ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call i32 @callee_different_types(ptr %p)
+  ret void
+}
+
+; The two loads overlap: Don't promote.
+define internal i32 @callee_overlap(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@callee_overlap
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT:    [[Y:%.*]] = load i32, ptr [[P1]], align 4
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %x = load i32, ptr %p
+  %p1 = getelementptr i8, ptr %p, i64 2
+  %y = load i32, ptr %p1
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+define void @caller_overlap(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@caller_overlap
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @callee_overlap(ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call i32 @callee_overlap(ptr %p)
+  ret void
+}


        

