[llvm] 170c4d2 - [ArgPromotion] Unify byval promotion with non-byval

Pavel Samolysov via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 28 05:23:11 PDT 2022


Author: Pavel Samolysov
Date: 2022-06-28T15:19:58+03:00
New Revision: 170c4d21bd94d4f183c2fec1dd7d261360df7bae

URL: https://github.com/llvm/llvm-project/commit/170c4d21bd94d4f183c2fec1dd7d261360df7bae
DIFF: https://github.com/llvm/llvm-project/commit/170c4d21bd94d4f183c2fec1dd7d261360df7bae.diff

LOG: [ArgPromotion] Unify byval promotion with non-byval

It makes sense to handle byval promotion in the same way as non-byval,
while also allowing `store` instructions. However, the stores should
pass the same checks as the `load` instructions do, i.e. be part of the
`ArgsToPromote` collection; only the check for interfering
modifications can be disabled for them. The promotion algorithm itself
has been reworked substantially: all the accesses (i.e. loads and
stores) are rewritten to target the emitted `alloca` instructions. To
optimize these new `alloca`s away, the `PromoteMemToReg` function from
`Transforms/Utils/PromoteMemoryToRegister.cpp` is invoked after
promotion.
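
As a rough illustration (a hand-written sketch, not taken from this
patch's tests; the function and value names are invented, although the
`.val` and `.allc` suffixes match what the pass emits), these are
before/after snapshots of a byval argument that is both loaded and
stored, as they would look before `PromoteMemToReg` runs:

%struct.ss = type { i32, i64 }

; Before promotion: the callee reads and writes the first field in place.
define internal void @f(%struct.ss* byval(%struct.ss) align 4 %b) {
entry:
  %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
  %temp1 = load i32, i32* %temp, align 4
  %temp2 = add i32 %temp1, 1
  store i32 %temp2, i32* %temp, align 4
  ret void
}

; After doPromotion, conceptually: the accessed part is passed as a scalar,
; spilled into a dedicated alloca, and the old load and store are retargeted
; to that alloca.
define internal void @f(i32 %b.0.val) {
entry:
  %b.0.allc = alloca i32, align 4
  store i32 %b.0.val, i32* %b.0.allc, align 4
  %temp1 = load i32, i32* %b.0.allc, align 4
  %temp2 = add i32 %temp1, 1
  store i32 %temp2, i32* %b.0.allc, align 4
  ret void
}

`PromoteMemToReg` then erases the alloca entirely, leaving only
`%temp2 = add i32 %b.0.val, 1`, which is what the updated CHECK lines
in byval.ll below expect.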

In order to let `PromoteMemToReg` promote as many `alloca`s as
possible, there should be no `GEP`s into them. To eliminate the `GEP`s,
a separate `alloca` is generated for every argument part; a single
`alloca` for the whole argument (which would significantly simplify the
code of the pass) unfortunately cannot be used, as sketched below.
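
To see why (a minimal sketch with the same invented names; the offsets
assume the `{ i32, i64 }` struct layout used throughout the tests):
`isAllocaPromotable()` rejects an alloca whose users include `GEP`s, so
a single aggregate alloca would survive mem2reg, while per-part allocas
are used only by direct loads and stores:

; A single alloca for the whole argument: the GEP users make it
; non-promotable, so mem2reg would leave the memory traffic behind.
%b = alloca %struct.ss, align 8
%b.0 = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
store i32 %b.0.val, i32* %b.0, align 8
%b.8 = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 1
store i64 %b.8.val, i64* %b.8, align 8

; One alloca per argument part: every user is a direct load or store, so
; each alloca is trivially promotable on its own.
%b.0.allc = alloca i32, align 8
store i32 %b.0.val, i32* %b.0.allc, align 8
%b.8.allc = alloca i64, align 8
store i64 %b.8.val, i64* %b.8.allc, align 8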

The idea comes from the following discussion:
https://reviews.llvm.org/D124514#3479676

Differential Revision: https://reviews.llvm.org/D125485

Added: 
    llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll
    llvm/test/Transforms/ArgumentPromotion/store-after-load.ll
    llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll

Modified: 
    llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
    llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
    llvm/test/Transforms/ArgumentPromotion/attrs.ll
    llvm/test/Transforms/ArgumentPromotion/byval-2.ll
    llvm/test/Transforms/ArgumentPromotion/byval.ll
    llvm/test/Transforms/ArgumentPromotion/dbg.ll
    llvm/test/Transforms/ArgumentPromotion/fp80.ll
    llvm/test/Transforms/ArgumentPromotion/metadata.ll

Removed: 
    llvm/test/Transforms/ArgumentPromotion/byval-through-pointer-promotion.ll


################################################################################
diff  --git a/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h b/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
index 35481843c0e3e..ac08b6c8877ac 100644
--- a/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
+++ b/llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
@@ -28,6 +28,8 @@ class ArgumentPromotionPass : public PassInfoMixin<ArgumentPromotionPass> {
   ArgumentPromotionPass(unsigned MaxElements = 2u) : MaxElements(MaxElements) {}
 
   /// Checks if a type could have padding bytes.
+  // TODO: this function isn't used in the ArgumentPromotionPass anymore and
+  // should be moved into AttributorAttributes.cpp, its single known user.
   static bool isDenselyPacked(Type *Ty, const DataLayout &DL);
 
   PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,

diff  --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 9470d6641b364..1836fd774606a 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -29,6 +29,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
+
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
@@ -56,6 +57,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
@@ -75,6 +77,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -86,7 +89,6 @@ using namespace llvm;
 #define DEBUG_TYPE "argpromotion"
 
 STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
-STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
 STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
 
 namespace {
@@ -94,9 +96,9 @@ namespace {
 struct ArgPart {
   Type *Ty;
   Align Alignment;
-  /// A representative guaranteed-executed load instruction for use by
+  /// A representative guaranteed-executed load or store instruction for use by
   /// metadata transfer.
-  LoadInst *MustExecLoad;
+  Instruction *MustExecInstr;
 };
 
 using OffsetAndArgPart = std::pair<int64_t, ArgPart>;
@@ -154,9 +156,9 @@ static Value *createByteGEP(IRBuilderBase &IRB, const DataLayout &DL,
 /// arguments, and returns the new function.  At this point, we know that it's
 /// safe to do so.
 static Function *doPromotion(
-    Function *F,
+    Function *F, function_ref<DominatorTree &(Function &F)> DTGetter,
+    function_ref<AssumptionCache *(Function &F)> ACGetter,
     const DenseMap<Argument *, SmallVector<OffsetAndArgPart, 4>> &ArgsToPromote,
-    SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
     Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
         ReplaceCallSite) {
   // Start by computing a new prototype for the function, which is the same as
@@ -174,15 +176,7 @@ static Function *doPromotion(
   unsigned ArgNo = 0;
   for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
        ++I, ++ArgNo) {
-    if (ByValArgsToTransform.count(&*I)) {
-      // Simple byval argument? Just add all the struct element types.
-      Type *AgTy = I->getParamByValType();
-      StructType *STy = cast<StructType>(AgTy);
-      llvm::append_range(Params, STy->elements());
-      ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
-                        AttributeSet());
-      ++NumByValArgsPromoted;
-    } else if (!ArgsToPromote.count(&*I)) {
+    if (!ArgsToPromote.count(&*I)) {
       // Unchanged argument
       Params.push_back(I->getType());
       ArgAttrVec.push_back(PAL.getParamAttrs(ArgNo));
@@ -250,29 +244,10 @@ static Function *doPromotion(
     auto *AI = CB.arg_begin();
     ArgNo = 0;
     for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
-         ++I, ++AI, ++ArgNo)
-      if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+         ++I, ++AI, ++ArgNo) {
+      if (!ArgsToPromote.count(&*I)) {
         Args.push_back(*AI); // Unmodified argument
         ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo));
-      } else if (ByValArgsToTransform.count(&*I)) {
-        // Emit a GEP and load for each element of the struct.
-        Type *AgTy = I->getParamByValType();
-        StructType *STy = cast<StructType>(AgTy);
-        Value *Idxs[2] = {
-            ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
-        const StructLayout *SL = DL.getStructLayout(STy);
-        Align StructAlign = *I->getParamAlign();
-        for (unsigned J = 0, Elems = STy->getNumElements(); J != Elems; ++J) {
-          Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), J);
-          auto *Idx =
-              IRB.CreateGEP(STy, *AI, Idxs, (*AI)->getName() + "." + Twine(J));
-          // TODO: Tell AA about the new values?
-          Align Alignment =
-              commonAlignment(StructAlign, SL->getElementOffset(J));
-          Args.push_back(IRB.CreateAlignedLoad(
-              STy->getElementType(J), Idx, Alignment, Idx->getName() + ".val"));
-          ArgAttrVec.push_back(AttributeSet());
-        }
       } else if (!I->use_empty()) {
         Value *V = *AI;
         const auto &ArgParts = ArgsToPromote.find(&*I)->second;
@@ -281,9 +256,9 @@ static Function *doPromotion(
               Pair.second.Ty,
               createByteGEP(IRB, DL, V, Pair.second.Ty, Pair.first),
               Pair.second.Alignment, V->getName() + ".val");
-          if (Pair.second.MustExecLoad) {
-            LI->setAAMetadata(Pair.second.MustExecLoad->getAAMetadata());
-            LI->copyMetadata(*Pair.second.MustExecLoad,
+          if (Pair.second.MustExecInstr) {
+            LI->setAAMetadata(Pair.second.MustExecInstr->getAAMetadata());
+            LI->copyMetadata(*Pair.second.MustExecInstr,
                              {LLVMContext::MD_range, LLVMContext::MD_nonnull,
                               LLVMContext::MD_dereferenceable,
                               LLVMContext::MD_dereferenceable_or_null,
@@ -293,6 +268,7 @@ static Function *doPromotion(
           ArgAttrVec.push_back(AttributeSet());
         }
       }
+    }
 
     // Push any varargs arguments on the list.
     for (; AI != CB.arg_end(); ++AI, ++ArgNo) {
@@ -342,11 +318,15 @@ static Function *doPromotion(
   // function empty.
   NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
 
+  // We will collect all the newly created allocas to promote them into
+  // registers after the following loop.
+  SmallVector<AllocaInst *, 4> Allocas;
+
   // Loop over the argument list, transferring uses of the old arguments over to
   // the new arguments, also transferring over the names as well.
   Function::arg_iterator I2 = NF->arg_begin();
   for (Argument &Arg : F->args()) {
-    if (!ArgsToPromote.count(&Arg) && !ByValArgsToTransform.count(&Arg)) {
+    if (!ArgsToPromote.count(&Arg)) {
       // If this is an unmodified argument, move the name and users over to the
       // new version.
       Arg.replaceAllUsesWith(&*I2);
@@ -355,37 +335,6 @@ static Function *doPromotion(
       continue;
     }
 
-    if (ByValArgsToTransform.count(&Arg)) {
-      // In the callee, we create an alloca, and store each of the new incoming
-      // arguments into the alloca.
-      Instruction *InsertPt = &NF->begin()->front();
-
-      // Just add all the struct element types.
-      Type *AgTy = Arg.getParamByValType();
-      Align StructAlign = *Arg.getParamAlign();
-      Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
-                                        StructAlign, "", InsertPt);
-      StructType *STy = cast<StructType>(AgTy);
-      Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
-                        nullptr};
-      const StructLayout *SL = DL.getStructLayout(STy);
-
-      for (unsigned J = 0, Elems = STy->getNumElements(); J != Elems; ++J) {
-        Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), J);
-        Value *Idx = GetElementPtrInst::Create(
-            AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(J),
-            InsertPt);
-        I2->setName(Arg.getName() + "." + Twine(J));
-        Align Alignment = commonAlignment(StructAlign, SL->getElementOffset(J));
-        new StoreInst(&*I2++, Idx, false, Alignment, InsertPt);
-      }
-
-      // Anything that used the arg should now use the alloca.
-      Arg.replaceAllUsesWith(TheAlloca);
-      TheAlloca->takeName(&Arg);
-      continue;
-    }
-
     // There potentially are metadata uses for things like llvm.dbg.value.
     // Replace them with undef, after handling the other regular uses.
     auto RauwUndefMetadata = make_scope_exit(
@@ -394,16 +343,45 @@ static Function *doPromotion(
     if (Arg.use_empty())
       continue;
 
-    SmallDenseMap<int64_t, Argument *> OffsetToArg;
+    // Otherwise, if we promoted this argument, we have to create an alloca
+    // in the callee for every promotable part and store each of the new
+    // incoming arguments into the corresponding alloca, which gives the old
+    // code (in particular the store instructions, if they are allowed) a
+    // chance to work as before.
+    assert(Arg.getType()->isPointerTy() &&
+           "Only arguments with a pointer type are promotable");
+
+    IRBuilder<NoFolder> IRB(&NF->begin()->front());
+
+    // Add only the promoted elements, i.e. the parts from ArgsToPromote.
+    SmallDenseMap<int64_t, AllocaInst *> OffsetToAlloca;
     for (const auto &Pair : ArgsToPromote.find(&Arg)->second) {
-      Argument &NewArg = *I2++;
-      NewArg.setName(Arg.getName() + "." + Twine(Pair.first) + ".val");
-      OffsetToArg.insert({Pair.first, &NewArg});
+      int64_t Offset = Pair.first;
+      const ArgPart &Part = Pair.second;
+
+      Argument *NewArg = I2++;
+      NewArg->setName(Arg.getName() + "." + Twine(Offset) + ".val");
+
+      AllocaInst *NewAlloca = IRB.CreateAlloca(
+          Part.Ty, nullptr, Arg.getName() + "." + Twine(Offset) + ".allc");
+      NewAlloca->setAlignment(Pair.second.Alignment);
+      IRB.CreateAlignedStore(NewArg, NewAlloca, Pair.second.Alignment);
+
+      // Collect the alloca to retarget the users to
+      OffsetToAlloca.insert({Offset, NewAlloca});
     }
 
-    // Otherwise, if we promoted this argument, then all users are load
-    // instructions (with possible casts and GEPs in between).
+    auto GetAlloca = [&](Value *Ptr) {
+      APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+      Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
+                                                   /* AllowNonInbounds */ true);
+      assert(Ptr == &Arg && "Not constant offset from arg?");
+      return OffsetToAlloca.lookup(Offset.getSExtValue());
+    };
 
+    // Clean up the dead instructions: the GEPs and BitCasts between the
+    // original argument and its users (the loads and stores), and retarget
+    // every load and store to the newly created alloca.
     SmallVector<Value *, 16> Worklist;
     SmallVector<Instruction *, 16> DeadInsts;
     append_range(Worklist, Arg.users());
@@ -417,13 +395,14 @@ static Function *doPromotion(
 
       if (auto *LI = dyn_cast<LoadInst>(V)) {
         Value *Ptr = LI->getPointerOperand();
-        APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
-        Ptr =
-            Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
-                                                   /* AllowNonInbounds */ true);
-        assert(Ptr == &Arg && "Not constant offset from arg?");
-        LI->replaceAllUsesWith(OffsetToArg[Offset.getSExtValue()]);
-        DeadInsts.push_back(LI);
+        LI->setOperand(LoadInst::getPointerOperandIndex(), GetAlloca(Ptr));
+        continue;
+      }
+
+      if (auto *SI = dyn_cast<StoreInst>(V)) {
+        assert(!SI->isVolatile() && "Volatile operations can't be promoted.");
+        Value *Ptr = SI->getPointerOperand();
+        SI->setOperand(StoreInst::getPointerOperandIndex(), GetAlloca(Ptr));
         continue;
       }
 
@@ -434,6 +413,23 @@ static Function *doPromotion(
       I->replaceAllUsesWith(PoisonValue::get(I->getType()));
       I->eraseFromParent();
     }
+
+    // Collect the allocas for promotion
+    for (const auto &Pair : OffsetToAlloca) {
+      assert(isAllocaPromotable(Pair.second) &&
+             "By design, only promotable allocas should be produced.");
+      Allocas.push_back(Pair.second);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "ARG PROMOTION: " << Allocas.size()
+                    << " alloca(s) are promotable by Mem2Reg\n");
+
+  if (!Allocas.empty()) {
+    // Now we are able to call PromoteMemToReg(); our earlier checks have
+    // ensured that it will succeed.
+    PromoteMemToReg(Allocas, DTGetter(*NF), ACGetter(*NF));
   }
 
   return NF;
@@ -456,8 +452,8 @@ static bool allCallersPassValidPointerForArgument(Argument *Arg,
   // direct callees.
   return all_of(Callee->users(), [&](User *U) {
     CallBase &CB = cast<CallBase>(*U);
-    return isDereferenceableAndAlignedPointer(
-        CB.getArgOperand(Arg->getArgNo()), NeededAlign, Bytes, DL);
+    return isDereferenceableAndAlignedPointer(CB.getArgOperand(Arg->getArgNo()),
+                                              NeededAlign, Bytes, DL);
   });
 }
 
@@ -470,7 +466,7 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
   if (Arg->use_empty())
     return true;
 
-  // We can only promote this argument if all of the uses are loads at known
+  // We can only promote this argument if all the uses are loads at known
   // offsets.
   //
   // Promoting the argument causes it to be loaded in the caller
@@ -487,15 +483,22 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
   Align NeededAlign(1);
   uint64_t NeededDerefBytes = 0;
 
-  // Returns None if this load is not based on the argument. Return true if
-  // we can promote the load, false otherwise.
-  auto HandleLoad = [&](LoadInst *LI,
-                        bool GuaranteedToExecute) -> Optional<bool> {
-    // Don't promote volatile or atomic loads.
-    if (!LI->isSimple())
+  // If this is a byval argument, we also allow store instructions. Only
+  // handle arguments with a specified alignment this way; if the alignment
+  // is unspecified, the actual alignment of the argument is
+  // target-specific.
+  bool AreStoresAllowed = Arg->getParamByValType() && Arg->getParamAlign();
+
+  // An end user of a pointer argument is a load or store instruction.
+  // Returns None if this load or store is not based on the argument. Returns
+  // true if we can promote the instruction, false otherwise.
+  auto HandleEndUser = [&](auto *I, Type *Ty,
+                           bool GuaranteedToExecute) -> Optional<bool> {
+    // Don't promote volatile or atomic instructions.
+    if (!I->isSimple())
       return false;
 
-    Value *Ptr = LI->getPointerOperand();
+    Value *Ptr = I->getPointerOperand();
     APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
     Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
                                                  /* AllowNonInbounds */ true);
@@ -505,7 +508,6 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
     if (Offset.getSignificantBits() >= 64)
       return false;
 
-    Type *Ty = LI->getType();
     TypeSize Size = DL.getTypeStoreSize(Ty);
     // Don't try to promote scalable types.
     if (Size.isScalable())
@@ -518,7 +520,7 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
 
     int64_t Off = Offset.getSExtValue();
     auto Pair = ArgParts.try_emplace(
-        Off, ArgPart{Ty, LI->getAlign(), GuaranteedToExecute ? LI : nullptr});
+        Off, ArgPart{Ty, I->getAlign(), GuaranteedToExecute ? I : nullptr});
     ArgPart &Part = Pair.first->second;
     bool OffsetNotSeenBefore = Pair.second;
 
@@ -530,44 +532,49 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
       return false;
     }
 
-    // For now, we only support loading one specific type at a given offset.
+    // For now, we only support loading/storing one specific type at a given
+    // offset.
     if (Part.Ty != Ty) {
       LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: "
-                        << "loaded via both " << *Part.Ty << " and " << *Ty
+                        << "accessed as both " << *Part.Ty << " and " << *Ty
                         << " at offset " << Off << "\n");
       return false;
     }
 
-    // If this load is not guaranteed to execute, and we haven't seen a load at
-    // this offset before (or it had lower alignment), then we need to remember
-    // that requirement.
-    // Note that skipping loads of previously seen offsets is only correct
-    // because we only allow a single type for a given offset, which also means
-    // that the number of accessed bytes will be the same.
+    // If this instruction is not guaranteed to execute, and we haven't seen a
+    // load or store at this offset before (or it had lower alignment), then we
+    // need to remember that requirement.
+    // Note that skipping instructions of previously seen offsets is only
+    // correct because we only allow a single type for a given offset, which
+    // also means that the number of accessed bytes will be the same.
     if (!GuaranteedToExecute &&
-        (OffsetNotSeenBefore || Part.Alignment < LI->getAlign())) {
+        (OffsetNotSeenBefore || Part.Alignment < I->getAlign())) {
       // We won't be able to prove dereferenceability for negative offsets.
       if (Off < 0)
         return false;
 
       // If the offset is not aligned, an aligned base pointer won't help.
-      if (!isAligned(LI->getAlign(), Off))
+      if (!isAligned(I->getAlign(), Off))
         return false;
 
       NeededDerefBytes = std::max(NeededDerefBytes, Off + Size.getFixedValue());
-      NeededAlign = std::max(NeededAlign, LI->getAlign());
+      NeededAlign = std::max(NeededAlign, I->getAlign());
     }
 
-    Part.Alignment = std::max(Part.Alignment, LI->getAlign());
+    Part.Alignment = std::max(Part.Alignment, I->getAlign());
     return true;
   };
 
-  // Look for loads that are guaranteed to execute on entry.
+  // Look for loads and stores that are guaranteed to execute on entry.
   for (Instruction &I : Arg->getParent()->getEntryBlock()) {
+    Optional<bool> Res{};
     if (LoadInst *LI = dyn_cast<LoadInst>(&I))
-      if (Optional<bool> Res = HandleLoad(LI, /* GuaranteedToExecute */ true))
-        if (!*Res)
-          return false;
+      Res = HandleEndUser(LI, LI->getType(), /* GuaranteedToExecute */ true);
+    else if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+      Res = HandleEndUser(SI, SI->getValueOperand()->getType(),
+                          /* GuaranteedToExecute */ true);
+    if (Res && !*Res)
+      return false;
 
     if (!isGuaranteedToTransferExecutionToSuccessor(&I))
       break;
@@ -575,36 +582,49 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
 
   // Now look at all loads of the argument. Remember the load instructions
   // for the aliasing check below.
-  SmallVector<Value *, 16> Worklist;
-  SmallPtrSet<Value *, 16> Visited;
+  SmallVector<const Use *, 16> Worklist;
+  SmallPtrSet<const Use *, 16> Visited;
   SmallVector<LoadInst *, 16> Loads;
-  auto AppendUsers = [&](Value *V) {
-    for (User *U : V->users())
-      if (Visited.insert(U).second)
-        Worklist.push_back(U);
+  auto AppendUses = [&](const Value *V) {
+    for (const Use &U : V->uses())
+      if (Visited.insert(&U).second)
+        Worklist.push_back(&U);
   };
-  AppendUsers(Arg);
+  AppendUses(Arg);
   while (!Worklist.empty()) {
-    Value *V = Worklist.pop_back_val();
+    const Use *U = Worklist.pop_back_val();
+    Value *V = U->getUser();
     if (isa<BitCastInst>(V)) {
-      AppendUsers(V);
+      AppendUses(V);
       continue;
     }
 
     if (auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
       if (!GEP->hasAllConstantIndices())
         return false;
-      AppendUsers(V);
+      AppendUses(V);
       continue;
     }
 
     if (auto *LI = dyn_cast<LoadInst>(V)) {
-      if (!*HandleLoad(LI, /* GuaranteedToExecute */ false))
+      if (!*HandleEndUser(LI, LI->getType(), /* GuaranteedToExecute */ false))
         return false;
       Loads.push_back(LI);
       continue;
     }
 
+    // Stores are allowed for byval arguments
+    auto *SI = dyn_cast<StoreInst>(V);
+    if (AreStoresAllowed && SI &&
+        U->getOperandNo() == StoreInst::getPointerOperandIndex()) {
+      if (!*HandleEndUser(SI, SI->getValueOperand()->getType(),
+                          /* GuaranteedToExecute */ false))
+        return false;
+      continue;
+      // Only stores TO the argument are allowed; all the other stores are
+      // unknown users.
+    }
+
     // Unknown user.
     LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: "
                       << "unknown user " << *V << "\n");
@@ -630,8 +650,6 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
        [](const auto &A, const auto &B) { return A.first < B.first; });
 
   // Make sure the parts are non-overlapping.
-  // TODO: As we're doing pure load promotion here, overlap should be fine from
-  // a correctness perspective. Profitability is less obvious though.
   int64_t Offset = ArgPartsVec[0].first;
   for (const auto &Pair : ArgPartsVec) {
     if (Pair.first < Offset)
@@ -640,6 +658,12 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
     Offset = Pair.first + DL.getTypeStoreSize(Pair.second.Ty);
   }
 
+  // If store instructions are allowed, the path from the entry of the
+  // function to each load may contain instructions that potentially
+  // invalidate that load; this is an admissible situation.
+  if (AreStoresAllowed)
+    return true;
+
   // Okay, now we know that the argument is only used by load instructions, and
   // it is safe to unconditionally perform all of them. Use alias analysis to
   // check to see if the pointer is guaranteed to not be modified from entry of
@@ -712,40 +736,6 @@ bool ArgumentPromotionPass::isDenselyPacked(Type *Ty, const DataLayout &DL) {
   return true;
 }
 
-/// Checks if the padding bytes of an argument could be accessed.
-static bool canPaddingBeAccessed(Argument *Arg) {
-  assert(Arg->hasByValAttr());
-
-  // Track all the pointers to the argument to make sure they are not captured.
-  SmallPtrSet<Value *, 16> PtrValues;
-  PtrValues.insert(Arg);
-
-  // Track all of the stores.
-  SmallVector<StoreInst *, 16> Stores;
-
-  // Scan through the uses recursively to make sure the pointer is always used
-  // sanely.
-  SmallVector<Value *, 16> WorkList(Arg->users());
-  while (!WorkList.empty()) {
-    Value *V = WorkList.pop_back_val();
-    if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
-      if (PtrValues.insert(V).second)
-        append_range(WorkList, V->users());
-    } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
-      Stores.push_back(Store);
-    } else if (!isa<LoadInst>(V)) {
-      return true;
-    }
-  }
-
-  // Check to make sure the pointers aren't captured
-  for (StoreInst *Store : Stores)
-    if (PtrValues.count(Store->getValueOperand()))
-      return true;
-
-  return false;
-}
-
 /// Check if callers and callee agree on how promoted arguments would be
 /// passed.
 static bool areTypesABICompatible(ArrayRef<Type *> Types, const Function &F,
@@ -767,6 +757,8 @@ static bool areTypesABICompatible(ArrayRef<Type *> Types, const Function &F,
 /// calls the DoPromotion method.
 static Function *
 promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
+                 function_ref<DominatorTree &(Function &F)> DTGetter,
+                 function_ref<AssumptionCache *(Function &F)> ACGetter,
                  unsigned MaxElements,
                  Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
                      ReplaceCallSite,
@@ -774,7 +766,7 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
   // Don't perform argument promotion for naked functions; otherwise we can end
   // up removing parameters that are seemingly 'not used' as they are referred
   // to in the assembly.
-  if(F->hasFnAttribute(Attribute::Naked))
+  if (F->hasFnAttribute(Attribute::Naked))
     return nullptr;
 
   // Make sure that it is local to this module.
@@ -833,7 +825,6 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
   // Check to see which arguments are promotable.  If an argument is promotable,
   // add it to ArgsToPromote.
   DenseMap<Argument *, SmallVector<OffsetAndArgPart, 4>> ArgsToPromote;
-  SmallPtrSet<Argument *, 8> ByValArgsToTransform;
   for (Argument *PtrArg : PointerArgs) {
     // Replace sret attribute with noalias. This reduces register pressure by
     // avoiding a register copy.
@@ -850,6 +841,7 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
 
     // If we can promote the pointer to its value.
     SmallVector<OffsetAndArgPart, 4> ArgParts;
+
     if (findArgParts(PtrArg, DL, AAR, MaxElements, IsRecursive, ArgParts)) {
       SmallVector<Type *, 4> Types;
       for (const auto &Pair : ArgParts)
@@ -857,56 +849,15 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
 
       if (areTypesABICompatible(Types, *F, TTI)) {
         ArgsToPromote.insert({PtrArg, std::move(ArgParts)});
-        continue;
       }
     }
-
-    // Otherwise, if this is a byval argument, and if the aggregate type is
-    // small, just pass the elements, which is always safe, if the passed value
-    // is densely packed or if we can prove the padding bytes are never
-    // accessed.
-    //
-    // Only handle arguments with specified alignment; if it's unspecified, the
-    // actual alignment of the argument is target-specific.
-    Type *ByValTy = PtrArg->getParamByValType();
-    bool IsSafeToPromote =
-        ByValTy && PtrArg->getParamAlign() &&
-        (ArgumentPromotionPass::isDenselyPacked(ByValTy, DL) ||
-         !canPaddingBeAccessed(PtrArg));
-    if (!IsSafeToPromote) {
-      LLVM_DEBUG(dbgs() << "ArgPromotion disables passing the elements of"
-                        << " the argument '" << PtrArg->getName()
-                        << "' because it is not safe.\n");
-      continue;
-    }
-    if (StructType *STy = dyn_cast<StructType>(ByValTy)) {
-      if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
-        LLVM_DEBUG(dbgs() << "ArgPromotion disables passing the elements of"
-                          << " the argument '" << PtrArg->getName()
-                          << "' because it would require adding more"
-                          << " than " << MaxElements
-                          << " arguments to the function.\n");
-        continue;
-      }
-      SmallVector<Type *, 4> Types;
-      append_range(Types, STy->elements());
-
-      // If all the elements are single-value types, we can promote it.
-      bool AllSimple =
-          all_of(Types, [](Type *Ty) { return Ty->isSingleValueType(); });
-
-      // Safe to transform. Passing the elements as a scalar will allow sroa to
-      // hack on the new alloca we introduce.
-      if (AllSimple && areTypesABICompatible(Types, *F, TTI))
-        ByValArgsToTransform.insert(PtrArg);
-    }
   }
 
   // No promotable pointer arguments.
-  if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
+  if (ArgsToPromote.empty())
     return nullptr;
 
-  return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
+  return doPromotion(F, DTGetter, ACGetter, ArgsToPromote, ReplaceCallSite);
 }
 
 PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
@@ -933,9 +884,19 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
         return FAM.getResult<AAManager>(F);
       };
 
-      const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(OldF);
-      Function *NewF = promoteArguments(&OldF, AARGetter, MaxElements, None,
-                                        TTI, IsRecursive);
+      auto DTGetter = [&](Function &F) -> DominatorTree & {
+        assert(&F != &OldF && "Called with the obsolete function!");
+        return FAM.getResult<DominatorTreeAnalysis>(F);
+      };
+
+      auto ACGetter = [&](Function &F) -> AssumptionCache * {
+        assert(&F != &OldF && "Called with the obsolete function!");
+        return &FAM.getResult<AssumptionAnalysis>(F);
+      };
+
+      const auto &TTI = FAM.getResult<TargetIRAnalysis>(OldF);
+      Function *NewF = promoteArguments(&OldF, AARGetter, DTGetter, ACGetter,
+                                        MaxElements, None, TTI, IsRecursive);
       if (!NewF)
         continue;
       LocalChange = true;

diff  --git a/llvm/test/Transforms/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/ArgumentPromotion/attrs.ll
index 3365199d95535..9e076fb301a97 100644
--- a/llvm/test/Transforms/ArgumentPromotion/attrs.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/attrs.ll
@@ -3,25 +3,14 @@
 
 %struct.ss = type { i32, i64 }
 
-; Don't drop 'byval' on %X here.
 define internal void @f(%struct.ss* byval(%struct.ss) align 4 %b, i32* byval(i32) align 4 %X, i32 %i) nounwind {
 ; CHECK-LABEL: define {{[^@]+}}@f
-; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval(i32) align 4 [[X:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (i32 [[B_0:%.*]], i32 [[X:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 4
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
-; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
-; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
-; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[X]], align 4
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
-
   %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
   %temp1 = load i32, i32* %temp, align 4
   %temp2 = add i32 %temp1, 1
@@ -41,11 +30,10 @@ define i32 @test(i32* %X) {
 ; CHECK-NEXT:    store i32 1, i32* [[TEMP1]], align 8
 ; CHECK-NEXT:    [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TEMP4]], align 4
-; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
+; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
 ; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
-; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
-; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]], i32* byval(i32) align 4 [[X]], i32 zeroext 0)
+; CHECK-NEXT:    [[X_VAL:%.*]] = load i32, i32* [[X]], align 4
+; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i32 [[X_VAL]], i32 zeroext 0)
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:

diff  --git a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
index 42b7d6d31905d..199f089932317 100644
--- a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
@@ -2,24 +2,14 @@
 ; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; Arg promotion eliminates the struct argument.
-; FIXME: We should eliminate the i32* argument.
 
 %struct.ss = type { i32, i64 }
 
 define internal void @f(%struct.ss* byval(%struct.ss) align 8 %b, i32* byval(i32) align 4 %X) nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@f
-; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval(i32) align 4 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (i32 [[B_0:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 8
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
-; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
-; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
-; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[X]], align 4
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -41,11 +31,10 @@ define i32 @test(i32* %X) {
 ; CHECK-NEXT:    store i32 1, i32* [[TEMP1]], align 8
 ; CHECK-NEXT:    [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TEMP4]], align 4
-; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 8
-; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
-; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]], i32* byval(i32) align 4 [[X]])
+; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
+; CHECK-NEXT:    [[X_VAL:%.*]] = load i32, i32* [[X]], align 4
+; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i32 [[X_VAL]])
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:

diff  --git a/llvm/test/Transforms/ArgumentPromotion/byval-through-pointer-promotion.ll b/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll
similarity index 100%
rename from llvm/test/Transforms/ArgumentPromotion/byval-through-pointer-promotion.ll
rename to llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll

diff  --git a/llvm/test/Transforms/ArgumentPromotion/byval.ll b/llvm/test/Transforms/ArgumentPromotion/byval.ll
index 2416345400c3d..0b02e8b129b94 100644
--- a/llvm/test/Transforms/ArgumentPromotion/byval.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/byval.ll
@@ -7,17 +7,9 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1
 
 define internal void @f(%struct.ss* byval(%struct.ss) align 4 %b) nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@f
-; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (i32 [[B_0:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 4
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
-; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
-; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
-; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -28,20 +20,11 @@ entry:
   ret void
 }
 
-
 define internal void @g(%struct.ss* byval(%struct.ss) align 32 %b) nounwind {
 ; CHECK-LABEL: define {{[^@]+}}@g
-; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (i32 [[B_0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 32
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 32
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
-; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
-; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
-; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -75,6 +58,63 @@ entry:
   ret void
 }
 
+; Transform even if an argument is written to and then loaded from.
+define internal void @k(%struct.ss* byval(%struct.ss) align 4 %b) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@k
+; CHECK-SAME: (i32 [[B_0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
+  %temp1 = load i32, i32* %temp, align 4
+  %temp2 = add i32 %temp1, 1
+  store i32 %temp2, i32* %temp, align 4
+  %temp3 = load i32, i32* %temp, align 4
+  ret void
+}
+
+; Transform even if a store instruction is the only user.
+define internal void @l(%struct.ss* byval(%struct.ss) align 4 %b) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@l
+; CHECK-SAME: (i32 [[B_0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
+  store i32 1, i32* %temp, align 4
+  ret void
+}
+
+; Transform all the arguments, creating the required number of allocas, and
+; then optimize them out.
+define internal void @m(%struct.ss* byval(%struct.ss) align 4 %b, %struct.ss* byval(%struct.ss) align 4 %c) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@m
+; CHECK-SAME: (i32 [[B_0:%.*]], i32 [[C_0:%.*]], i64 [[C_1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[B_0]], 1
+; CHECK-NEXT:    [[TEMP6:%.*]] = add i64 [[C_1]], 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
+  %temp1 = load i32, i32* %temp, align 4
+  %temp2 = add i32 %temp1, 1
+  store i32 %temp2, i32* %temp, align 4
+
+  %temp3 = getelementptr %struct.ss, %struct.ss* %c, i32 0, i32 0
+  store i32 %temp2, i32* %temp3, align 4
+  
+  %temp4 = getelementptr %struct.ss, %struct.ss* %c, i32 0, i32 1
+  %temp5 = load i64, i64* %temp4, align 8
+  %temp6 = add i64 %temp5, 1
+  store i64 %temp6, i64* %temp4, align 8
+
+  ret void
+}
+
 define i32 @main() nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@main
 ; CHECK-SAME: () #[[ATTR0]] {
@@ -84,17 +124,26 @@ define i32 @main() nounwind  {
 ; CHECK-NEXT:    store i32 1, i32* [[TEMP1]], align 8
 ; CHECK-NEXT:    [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TEMP4]], align 4
-; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
-; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
-; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]])
-; CHECK-NEXT:    [[S_01:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_01_VAL:%.*]] = load i32, i32* [[S_01]], align 32
-; CHECK-NEXT:    [[S_12:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_12_VAL:%.*]] = load i64, i64* [[S_12]], align 4
-; CHECK-NEXT:    call void @g(i32 [[S_01_VAL]], i64 [[S_12_VAL]])
+; CHECK-NEXT:    [[S_0_0_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_0_0_0_VAL:%.*]] = load i32, i32* [[S_0_0_0]], align 4
+; CHECK-NEXT:    call void @f(i32 [[S_0_0_0_VAL]])
+; CHECK-NEXT:    [[S_1_0_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_1_0_0_VAL:%.*]] = load i32, i32* [[S_1_0_0]], align 4
+; CHECK-NEXT:    call void @g(i32 [[S_1_0_0_VAL]])
 ; CHECK-NEXT:    call void @h(%struct.ss* byval([[STRUCT_SS]]) [[S]])
+; CHECK-NEXT:    [[S_2_0_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_2_0_0_VAL:%.*]] = load i32, i32* [[S_2_0_0]], align 4
+; CHECK-NEXT:    call void @k(i32 [[S_2_0_0_VAL]])
+; CHECK-NEXT:    [[S_3_0_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_3_0_0_VAL:%.*]] = load i32, i32* [[S_3_0_0]], align 4
+; CHECK-NEXT:    call void @l(i32 [[S_3_0_0_VAL]])
+; CHECK-NEXT:    [[S_4_0_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_4_0_0_VAL:%.*]] = load i32, i32* [[S_4_0_0]], align 4
+; CHECK-NEXT:    [[S_4_1_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_4_1_0_VAL:%.*]] = load i32, i32* [[S_4_1_0]], align 4
+; CHECK-NEXT:    [[S_4_1_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 1
+; CHECK-NEXT:    [[S_4_1_1_VAL:%.*]] = load i64, i64* [[S_4_1_1]], align 8
+; CHECK-NEXT:    call void @m(i32 [[S_4_0_0_VAL]], i32 [[S_4_1_0_VAL]], i64 [[S_4_1_1_VAL]])
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
@@ -106,7 +155,8 @@ entry:
   call void @f(%struct.ss* byval(%struct.ss) align 4 %S) nounwind
   call void @g(%struct.ss* byval(%struct.ss) align 32 %S) nounwind
   call void @h(%struct.ss* byval(%struct.ss) %S) nounwind
+  call void @k(%struct.ss* byval(%struct.ss) align 4 %S) nounwind
+  call void @l(%struct.ss* byval(%struct.ss) align 4 %S) nounwind
+  call void @m(%struct.ss* byval(%struct.ss) align 4 %S, %struct.ss* byval(%struct.ss) align 4 %S) nounwind
   ret i32 0
 }
-
-

diff  --git a/llvm/test/Transforms/ArgumentPromotion/dbg.ll b/llvm/test/Transforms/ArgumentPromotion/dbg.ll
index 3df10ab90bdff..a9d89d5864242 100644
--- a/llvm/test/Transforms/ArgumentPromotion/dbg.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/dbg.ll
@@ -17,22 +17,20 @@ define internal void @test(i32** %X) !dbg !2 {
 
 %struct.pair = type { i32, i32 }
 
+; Do not promote %P because there is a store of the pointer %P itself. Even
+; if %P had been promoted as a byval argument, the result would not have
+; been optimizable by SROA.
 define internal void @test_byval(%struct.pair* byval(%struct.pair) align 4 %P) {
 ; CHECK-LABEL: define {{[^@]+}}@test_byval
-; CHECK-SAME: (i32 [[P_0:%.*]], i32 [[P_1:%.*]]) {
-; CHECK-NEXT:    [[P:%.*]] = alloca [[STRUCT_PAIR:%.*]], align 4
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_PAIR]], [[STRUCT_PAIR]]* [[P]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[P_0]], i32* [[DOT0]], align 4
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_PAIR]], [[STRUCT_PAIR]]* [[P]], i32 0, i32 1
-; CHECK-NEXT:    store i32 [[P_1]], i32* [[DOT1]], align 4
+; CHECK-SAME: ([[STRUCT_PAIR:%.*]]* byval([[STRUCT_PAIR]]) align 4 [[P:%.*]]) {
 ; CHECK-NEXT:    [[SINK:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    [[DOT2:%.*]] = getelementptr [[STRUCT_PAIR]], [[STRUCT_PAIR]]* [[P]], i32 0, i32 0
-; CHECK-NEXT:    store i32* [[DOT2]], i32** [[SINK]], align 8
+; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_PAIR]], [[STRUCT_PAIR]]* [[P]], i32 0, i32 0
+; CHECK-NEXT:    store i32* [[TEMP]], i32** [[SINK]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %1 = alloca i32*, align 8
   %2 = getelementptr %struct.pair, %struct.pair* %P, i32 0, i32 0
-  store i32* %2, i32** %1, align 8 ; to protect from "usual" promotion
+  store i32* %2, i32** %1, align 8 ; to protect from promotion
   ret void
 }
 
@@ -42,11 +40,7 @@ define void @caller(i32** %Y, %struct.pair* %P) {
 ; CHECK-NEXT:    [[Y_VAL:%.*]] = load i32*, i32** [[Y]], align 8, !dbg [[DBG4:![0-9]+]]
 ; CHECK-NEXT:    [[Y_VAL_VAL:%.*]] = load i32, i32* [[Y_VAL]], align 8, !dbg [[DBG4]]
 ; CHECK-NEXT:    call void @test(i32 [[Y_VAL_VAL]]), !dbg [[DBG4]]
-; CHECK-NEXT:    [[P_0:%.*]] = getelementptr [[STRUCT_PAIR:%.*]], %struct.pair* [[P]], i32 0, i32 0, !dbg [[DBG5:![0-9]+]]
-; CHECK-NEXT:    [[P_0_VAL:%.*]] = load i32, i32* [[P_0]], align 4, !dbg [[DBG5]]
-; CHECK-NEXT:    [[P_1:%.*]] = getelementptr [[STRUCT_PAIR]], %struct.pair* [[P]], i32 0, i32 1, !dbg [[DBG5]]
-; CHECK-NEXT:    [[P_1_VAL:%.*]] = load i32, i32* [[P_1]], align 4, !dbg [[DBG5]]
-; CHECK-NEXT:    call void @test_byval(i32 [[P_0_VAL]], i32 [[P_1_VAL]]), !dbg [[DBG5]]
+; CHECK-NEXT:    call void @test_byval([[STRUCT_PAIR]]* byval([[STRUCT_PAIR]]) align 4 [[P]]), !dbg [[DBG5:![0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   call void @test(i32** %Y), !dbg !1

diff  --git a/llvm/test/Transforms/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/ArgumentPromotion/fp80.ll
index a0143d31cd934..90b6998495964 100644
--- a/llvm/test/Transforms/ArgumentPromotion/fp80.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/fp80.ll
@@ -14,23 +14,23 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @run() {
 ; CHECK-LABEL: define {{[^@]+}}@run() {
-; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %union.u* bitcast (%struct.s* @b to %union.u*) to i8*
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 10
 ; CHECK-NEXT:    [[DOTVAL:%.*]] = load i8, i8* [[TMP1]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @UseLongDoubleUnsafely(i8 [[DOTVAL]])
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[UNION_U:%.*]], %union.u* bitcast (%struct.s* @b to %union.u*), i32 0, i32 0
+; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[UNION_U:%.*]], %union.u* bitcast (%struct.s* @b to %union.u*), i64 0, i32 0
 ; CHECK-NEXT:    [[DOT0_VAL:%.*]] = load x86_fp80, x86_fp80* [[DOT0]], align 16
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_fp80 @UseLongDoubleSafely(x86_fp80 [[DOT0_VAL]])
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.Foo* @a to i64*
-; CHECK-NEXT:    [[A_VAL:%.*]] = load i64, i64* [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @AccessPaddingOfStruct(i64 [[A_VAL]])
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @CaptureAStruct(%struct.Foo* byval([[STRUCT_FOO:%.*]]) @a)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_fp80 @UseLongDoubleSafelyNoPromotion(%union.u* byval(%union.u) align 16 bitcast (%struct.s* @b to %union.u*))
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.Foo* @a to i64*
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i64, i64* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @AccessPaddingOfStruct(i64 [[A_VAL]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @CaptureAStruct(%struct.Foo* byval([[STRUCT_FOO:%.*]]) @a)
 ; CHECK-NEXT:    ret void
 ;
-entry:
   tail call i8 @UseLongDoubleUnsafely(%union.u* byval(%union.u) align 16 bitcast (%struct.s* @b to %union.u*))
   tail call x86_fp80 @UseLongDoubleSafely(%union.u* byval(%union.u) align 16 bitcast (%struct.s* @b to %union.u*))
+  tail call x86_fp80 @UseLongDoubleSafelyNoPromotion(%union.u* byval(%union.u) align 16 bitcast (%struct.s* @b to %union.u*))
   call i64 @AccessPaddingOfStruct(%struct.Foo* byval(%struct.Foo) @a)
   call i64 @CaptureAStruct(%struct.Foo* byval(%struct.Foo) @a)
   ret void
@@ -38,11 +38,9 @@ entry:
 
 define internal i8 @UseLongDoubleUnsafely(%union.u* byval(%union.u) align 16 %arg) {
 ; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleUnsafely
-; CHECK-SAME: (i8 [[ARG_10_VAL:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret i8 [[ARG_10_VAL]]
+; CHECK-SAME: (i8 [[ARG_0_VAL:%.*]]) {
+; CHECK-NEXT:    ret i8 [[ARG_0_VAL]]
 ;
-entry:
   %bitcast = bitcast %union.u* %arg to %struct.s*
   %gep = getelementptr inbounds %struct.s, %struct.s* %bitcast, i64 0, i32 2
   %result = load i8, i8* %gep
@@ -51,23 +49,30 @@ entry:
 
 define internal x86_fp80 @UseLongDoubleSafely(%union.u* byval(%union.u) align 16 %arg) {
 ; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleSafely
-; CHECK-SAME: (x86_fp80 [[ARG_0:%.*]]) {
-; CHECK-NEXT:    [[ARG:%.*]] = alloca [[UNION_U:%.*]], align 16
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[UNION_U]], [[UNION_U]]* [[ARG]], i32 0, i32 0
-; CHECK-NEXT:    store x86_fp80 [[ARG_0]], x86_fp80* [[DOT0]], align 16
+; CHECK-SAME: (x86_fp80 [[ARG_0_VAL:%.*]]) {
+; CHECK-NEXT:    ret x86_fp80 [[ARG_0_VAL]]
+;
+  %gep = getelementptr inbounds %union.u, %union.u* %arg, i64 0, i32 0
+  %fp80 = load x86_fp80, x86_fp80* %gep
+  ret x86_fp80 %fp80
+}
+
+define internal x86_fp80 @UseLongDoubleSafelyNoPromotion(%union.u* byval(%union.u) align 16 %arg) {
+; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleSafelyNoPromotion
+; CHECK-SAME: ([[UNION_U]]* byval([[UNION_U]]) align 16 [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[UNION_U]], [[UNION_U]]* [[ARG]], i64 0, i32 0
-; CHECK-NEXT:    [[IDX_P:%.*]] = alloca i64, align 8
-; CHECK-NEXT:    store i64 0, i64* [[IDX_P]], align 8
-; CHECK-NEXT:    [[IDX:%.*]] = load i64, i64* [[IDX_P]], align 8
+; CHECK-NEXT:    [[TMP_IDX:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    store i64 0, i64* [[TMP_IDX]], align 8
+; CHECK-NEXT:    [[IDX:%.*]] = load i64, i64* [[TMP_IDX]], align 8
 ; CHECK-NEXT:    [[GEP_IDX:%.*]] = getelementptr inbounds [[UNION_U]], [[UNION_U]]* [[ARG]], i64 [[IDX]], i32 0
-; CHECK-NEXT:    [[FP80:%.*]] = load x86_fp80, x86_fp80* [[GEP]], align 16
+; CHECK-NEXT:    [[FP80:%.*]] = load x86_fp80, x86_fp80* [[GEP]]
 ; CHECK-NEXT:    ret x86_fp80 [[FP80]]
 ;
   %gep = getelementptr inbounds %union.u, %union.u* %arg, i64 0, i32 0
   %idx_slot = alloca i64, align 8
   store i64 0, i64* %idx_slot, align 8
   %idx = load i64, i64* %idx_slot, align 8
-  %gep_idx = getelementptr inbounds %union.u, %union.u* %arg, i64 %idx, i32 0 ; to protect from "usual" promotion
+  %gep_idx = getelementptr inbounds %union.u, %union.u* %arg, i64 %idx, i32 0 ; to protect from promotion
   %fp80 = load x86_fp80, x86_fp80* %gep
   ret x86_fp80 %fp80
 }

diff  --git a/llvm/test/Transforms/ArgumentPromotion/metadata.ll b/llvm/test/Transforms/ArgumentPromotion/metadata.ll
index 3549bcb8f32df..91ec033116012 100644
--- a/llvm/test/Transforms/ArgumentPromotion/metadata.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/metadata.ll
@@ -7,6 +7,8 @@ declare void @use.p32(i32*)
 define internal void @callee(i32* %p1, i32** %p2, i32** %p3, i32** %p4, i32** %p5, i32** %p6) {
 ; CHECK-LABEL: define {{[^@]+}}@callee
 ; CHECK-SAME: (i32 [[P1_0_VAL:%.*]], i32* [[P2_0_VAL:%.*]], i32* [[P3_0_VAL:%.*]], i32* [[P4_0_VAL:%.*]], i32* [[P5_0_VAL:%.*]], i32* [[P6_0_VAL:%.*]]) {
+; CHECK-NEXT:    [[IS_NOT_NULL:%.*]] = icmp ne i32* [[P2_0_VAL]], null
+; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_NOT_NULL]])
 ; CHECK-NEXT:    call void @use.i32(i32 [[P1_0_VAL]])
 ; CHECK-NEXT:    call void @use.p32(i32* [[P2_0_VAL]])
 ; CHECK-NEXT:    call void @use.p32(i32* [[P3_0_VAL]])
@@ -51,6 +53,8 @@ define internal i32* @callee_conditional(i1 %c, i32** dereferenceable(8) align 8
 ; CHECK-SAME: (i1 [[C:%.*]], i32* [[P_0_VAL:%.*]]) {
 ; CHECK-NEXT:    br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK:       if:
+; CHECK-NEXT:    [[IS_NOT_NULL:%.*]] = icmp ne i32* [[P_0_VAL]], null
+; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_NOT_NULL]])
 ; CHECK-NEXT:    ret i32* [[P_0_VAL]]
 ; CHECK:       else:
 ; CHECK-NEXT:    ret i32* null

diff  --git a/llvm/test/Transforms/ArgumentPromotion/store-after-load.ll b/llvm/test/Transforms/ArgumentPromotion/store-after-load.ll
new file mode 100644
index 0000000000000..117c2c3a5b49c
--- /dev/null
+++ b/llvm/test/Transforms/ArgumentPromotion/store-after-load.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
+
+; Store instructions are allowed users for byval arguments only.
+define internal void @callee(i32* %arg) nounwind {
+; CHECK-LABEL: define {{[^@]+}}@callee
+; CHECK-SAME: (i32* [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TEMP:%.*]] = load i32, i32* [[ARG]], align 4
+; CHECK-NEXT:    [[SUM:%.*]] = add i32 [[TEMP]], 1
+; CHECK-NEXT:    store i32 [[SUM]], i32* [[ARG]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = load i32, i32* %arg, align 4
+  %sum = add i32 %temp, 1
+  store i32 %sum, i32* %arg, align 4
+  ret void
+}
+
+define i32 @caller(i32* %arg) nounwind {
+; CHECK-LABEL: define {{[^@]+}}@caller
+; CHECK-SAME: (i32* [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @callee(i32* [[ARG]]) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  call void @callee(i32* %arg) nounwind
+  ret i32 0
+}

diff  --git a/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll b/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll
new file mode 100644
index 0000000000000..7d7099003dc77
--- /dev/null
+++ b/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
+
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+%struct.ss = type { i32, i64 }
+
+define internal void @f(ptr byval(ptr) align 4 %p) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@f
+; CHECK-SAME: (ptr byval(ptr) align 4 [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store ptr [[P]], ptr [[P]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  store ptr %p, ptr %p
+  ret void
+}
+
+define internal void @g(ptr byval(ptr) align 4 %p) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@g
+; CHECK-SAME: (ptr byval(ptr) align 4 [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P]], i64 4
+; CHECK-NEXT:    store ptr [[P]], ptr [[P1]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p1 = getelementptr i8, ptr %p, i64 4
+  store ptr %p, ptr %p1
+  ret void
+}
+
+define internal void @h(ptr byval(ptr) align 4 %p) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@h
+; CHECK-SAME: (ptr byval(ptr) align 4 [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P]], i64 4
+; CHECK-NEXT:    store ptr [[P1]], ptr [[P]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p1 = getelementptr i8, ptr %p, i64 4
+  store ptr %p1, ptr %p
+  ret void
+}
+
+define internal void @k(ptr byval(ptr) align 4 %p) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@k
+; CHECK-SAME: (ptr byval(ptr) align 4 [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X:%.*]] = load ptr, ptr [[P]]
+; CHECK-NEXT:    store ptr [[P]], ptr [[X]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %x = load ptr, ptr %p
+  store ptr %p, ptr %x
+  ret void
+}
+
+define internal void @l(ptr byval(ptr) align 4 %p) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@l
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %x = load ptr, ptr %p
+  store ptr %x, ptr %p
+  ret void
+}
+
+define i32 @main() nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@main
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 32
+; CHECK-NEXT:    [[TEMP1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 0
+; CHECK-NEXT:    store i32 1, ptr [[TEMP1]], align 4
+; CHECK-NEXT:    [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CHECK-NEXT:    store i64 2, ptr [[TEMP4]], align 8
+; CHECK-NEXT:    call void @f(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]]
+; CHECK-NEXT:    call void @g(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]]
+; CHECK-NEXT:    call void @h(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]]
+; CHECK-NEXT:    call void @k(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]]
+; CHECK-NEXT:    [[S_VAL:%.*]] = load ptr, ptr [[S]], align 8
+; CHECK-NEXT:    call void @l() #[[ATTR0]]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %S = alloca %struct.ss, align 32
+  %temp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0
+  store i32 1, i32* %temp1, align 4
+  %temp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1
+  store i64 2, i64* %temp4, align 8
+  call void @f(ptr byval(ptr) align 4 %S) nounwind
+  call void @g(ptr byval(ptr) align 4 %S) nounwind
+  call void @h(ptr byval(ptr) align 4 %S) nounwind
+  call void @k(ptr byval(ptr) align 4 %S) nounwind
+  call void @l(ptr byval(ptr) align 4 %S) nounwind
+  ret i32 0
+}


        

