[llvm] e3515ba - Reapply "[AMDGPU] Modify adjustInliningThreshold to also consider the cost of passing function arguments through the stack"

Janek van Oirschot via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 13 04:17:50 PST 2023


Author: Janek van Oirschot
Date: 2023-02-13T12:17:43Z
New Revision: e3515ba3816b9cabeca6a3b03b90902ebcfd3c65

URL: https://github.com/llvm/llvm-project/commit/e3515ba3816b9cabeca6a3b03b90902ebcfd3c65
DIFF: https://github.com/llvm/llvm-project/commit/e3515ba3816b9cabeca6a3b03b90902ebcfd3c65.diff

LOG: Reapply "[AMDGPU] Modify adjustInliningThreshold to also consider the cost of passing function arguments through the stack"

Reapplies 142c28ffa1323e9a8d53200a22c80d5d778e0d0f (part of D140242), which was reverted due to AMDGPU OpenMP test failures.

This diff fixes said failures by eliding most of `adjustInliningThresholdUsingCallee` for indirect calls, since the callee function is unavailable in that case.
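
Concretely, what remains never touches the callee `Function`: the new `CallBase` overload of `AMDGPU::isArgPassedInSGPR` classifies each argument from the call site's calling convention and parameter attributes, so it also works when `CB->getCalledFunction()` is null, as for the indirect call in the new test:

  call void %func(i64 noundef %a0, ...) ; no callee Function to inspect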

Reviewed By: arsenm, #amdgpu

Differential Revision: https://reviews.llvm.org/D143498
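
For reference, the added heuristic only bumps the threshold for argument registers in excess of what can be passed without spilling (26 SGPRs, 32 VGPRs):

  adjustThreshold = max(0, SGPRsInUse - 26) * ArgStackCost * InstrCost
                  + max(0, VGPRsInUse - 32) * ArgStackCost * InstrCost

Here ArgStackCost is one instruction plus the TTI cost of a private-address-space store/load pair, and InstrCost is InlineConstants::getInstrCost() (the -inline-instr-cost value; the new tests set it to 5). The tests probe exactly this boundary: in the i64 test a 16 x i64 signature occupies 32 VGPR-sized registers (each i64 is split into two 32-bit registers) and gets no bonus, while 17 x i64 is two registers over and does; the i32 test draws the same line at 32 vs. 33 i32 arguments.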

Added: 
    llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument-i64.ll
    llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument.ll
    llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-array-ptr-argument.ll
    llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-indirect-call-argument.ll
    llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-ptr-argument.ll
    llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-struct-argument.ll
    llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-vector-ptr-argument.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 87e292fc76136..e2c647f196fdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -17,8 +17,10 @@
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
@@ -1167,10 +1169,57 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
   return true;
 }
 
+static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
+                                                   const SITargetLowering *TLI,
+                                                   const GCNTTIImpl *TTIImpl) {
+  const int NrOfSGPRUntilSpill = 26;
+  const int NrOfVGPRUntilSpill = 32;
+
+  const DataLayout &DL = TTIImpl->getDataLayout();
+
+  unsigned adjustThreshold = 0;
+  int SGPRsInUse = 0;
+  int VGPRsInUse = 0;
+  for (const Use &A : CB->args()) {
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
+    for (auto ArgVT : ValueVTs) {
+      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
+          CB->getContext(), CB->getCallingConv(), ArgVT);
+      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
+        SGPRsInUse += CCRegNum;
+      else
+        VGPRsInUse += CCRegNum;
+    }
+  }
+
+  // The cost of passing function arguments through the stack:
+  //  1 instruction to put a function argument on the stack in the caller.
+  //  1 instruction to take a function argument from the stack in the callee.
+  //  1 instruction to explicitly take care of data dependencies in the
+  //  callee function.
+  InstructionCost ArgStackCost(1);
+  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
+      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
+      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
+  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
+      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
+      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
+
+  // The penalty cost is computed relative to the cost of instructions and does
+  // not model any storage costs.
+  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
+                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
+  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
+                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
+  return adjustThreshold;
+}
+
 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
   // If we have a pointer to private array passed into a function
   // it will not be optimized out, leaving scratch usage.
   // Increase the inline threshold to allow inlining in this case.
+  unsigned adjustThreshold = 0;
   uint64_t AllocaSize = 0;
   SmallPtrSet<const AllocaInst *, 8> AIVisited;
   for (Value *PtrArg : CB->args()) {
@@ -1192,9 +1241,10 @@ unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
       }
     }
   }
-  if (AllocaSize)
-    return ArgAllocaCost;
-  return 0;
+  adjustThreshold +=
+      adjustInliningThresholdUsingCallee(CB, TLI, this);
+  adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
+  return adjustThreshold;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2acd167534ccc..f0667645c2bc8 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2481,10 +2481,35 @@ bool isArgPassedInSGPR(const Argument *A) {
   case CallingConv::AMDGPU_PS:
   case CallingConv::AMDGPU_CS:
   case CallingConv::AMDGPU_Gfx:
-    // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
-    // Everything else is in VGPRs.
-    return F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::InReg) ||
-           F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::ByVal);
+    // For non-compute shaders, SGPR inputs are marked with either inreg or
+    // byval. Everything else is in VGPRs.
+    return A->hasAttribute(Attribute::InReg) ||
+           A->hasAttribute(Attribute::ByVal);
+  default:
+    // TODO: Should calls support inreg for SGPR inputs?
+    return false;
+  }
+}
+
+bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
+  // Arguments to compute shaders are never a source of divergence.
+  CallingConv::ID CC = CB->getCallingConv();
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return true;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_LS:
+  case CallingConv::AMDGPU_HS:
+  case CallingConv::AMDGPU_ES:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_Gfx:
+    // For non-compute shaders, SGPR inputs are marked with either inreg or
+    // byval. Everything else is in VGPRs.
+    return CB->paramHasAttr(ArgNo, Attribute::InReg) ||
+           CB->paramHasAttr(ArgNo, Attribute::ByVal);
   default:
     // TODO: Should calls support inreg for SGPR inputs?
     return false;

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 96d8cb3031dd8..ef332e162de70 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -12,6 +12,7 @@
 #include "SIDefines.h"
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Alignment.h"
 #include <array>
@@ -1260,6 +1261,8 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
 
 bool isArgPassedInSGPR(const Argument *Arg);
 
+bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);
+
 LLVM_READONLY
 bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
                                       int64_t EncodedOffset);

diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument-i64.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument-i64.ll
new file mode 100644
index 0000000000000..af21cd686c2b5
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument-i64.ll
@@ -0,0 +1,100 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s
+
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall1 = call noundef i64 @non_inlining_call
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall2 = call noundef i64 @non_inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall1 = call noundef i64 @inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall2 = call noundef i64 @inlining_call
+
+define noundef i64 @non_inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0) {
+entry:
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  %xor15 = xor i64 %xor14, 1
+  %xor16 = xor i64 %xor15, 2
+  ret i64 %xor16
+}
+
+define noundef i64 @inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) {
+entry:
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  %xor15 = xor i64 %xor14, %q0
+  %xor16 = xor i64 %xor15, 1
+  %xor17 = xor i64 %xor16, 1
+  ret i64 %xor17
+}
+
+; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. 
+define i64 @Caller(ptr noundef %in) {
+entry:
+  %arrayidx = getelementptr inbounds i64, ptr %in, i64 0
+  %a0 = load i64, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i64, ptr %in, i64 1
+  %b0 = load i64, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i64, ptr %in, i64 2
+  %c0 = load i64, ptr %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i64, ptr %in, i64 3
+  %d0 = load i64, ptr %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i64, ptr %in, i64 4
+  %e0 = load i64, ptr %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i64, ptr %in, i64 5
+  %f0 = load i64, ptr %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds i64, ptr %in, i64 6
+  %g0 = load i64, ptr %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i64, ptr %in, i64 7
+  %h0 = load i64, ptr %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds i64, ptr %in, i64 8
+  %i0 = load i64, ptr %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i64, ptr %in, i64 9
+  %j0 = load i64, ptr %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i64, ptr %in, i64 10
+  %k0 = load i64, ptr %arrayidx10, align 4
+  %arrayidx11 = getelementptr inbounds i64, ptr %in, i64 11
+  %l0 = load i64, ptr %arrayidx11, align 4
+  %arrayidx12 = getelementptr inbounds i64, ptr %in, i64 12
+  %m0 = load i64, ptr %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i64, ptr %in, i64 13
+  %n0 = load i64, ptr %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i64, ptr %in, i64 14
+  %o0 = load i64, ptr %arrayidx14, align 4
+  %arrayidx15 = getelementptr inbounds i64, ptr %in, i64 15
+  %p0 = load i64, ptr %arrayidx15, align 4
+  %arrayidx16 = getelementptr inbounds i64, ptr %in, i64 16
+  %q0 = load i64, ptr %arrayidx16, align 4
+  %noinlinecall1 = call noundef i64 @non_inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0)
+  %add = add i64 0, %noinlinecall1
+  %noinlinecall2 = call noundef i64 @non_inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0)
+  %add2 = add i64 %add, %noinlinecall2
+  %inlinecall1 = call noundef i64 @inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0)
+  %add3 = add i64 %add2, %inlinecall1
+  %inlinecall2 = call noundef i64 @inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0)
+  %add4 = add i64 %add3, %inlinecall2
+  ret i64 %add4
+}

diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument.ll
new file mode 100644
index 0000000000000..3f5af81e9fddd
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument.ll
@@ -0,0 +1,164 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s
+
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall1 = call noundef i32 @non_inlining_call
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall2 = call noundef i32 @non_inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall1 = call noundef i32 @inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall2 = call noundef i32 @inlining_call
+
+define noundef i32 @non_inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1) {
+entry:
+  %xor = xor i32 %a0, %b0
+  %xor1 = xor i32 %xor, %c0
+  %xor2 = xor i32 %xor1, %d0
+  %xor3 = xor i32 %xor2, %e0
+  %xor4 = xor i32 %xor3, %f0
+  %xor5 = xor i32 %xor4, %g0
+  %xor6 = xor i32 %xor5, %h0
+  %xor7 = xor i32 %xor6, %i0
+  %xor8 = xor i32 %xor7, %j0
+  %xor9 = xor i32 %xor8, %k0
+  %xor10 = xor i32 %xor9, %l0
+  %xor11 = xor i32 %xor10, %m0
+  %xor12 = xor i32 %xor11, %n0
+  %xor13 = xor i32 %xor12, %o0
+  %xor14 = xor i32 %xor13, %p0
+  %xor15 = xor i32 %xor14, %q0
+  %xor16 = xor i32 %xor15, %r0
+  %xor17 = xor i32 %xor16, %s0
+  %xor18 = xor i32 %xor17, %t0
+  %xor19 = xor i32 %xor18, %u0
+  %xor20 = xor i32 %xor19, %v0
+  %xor21 = xor i32 %xor20, %w0
+  %xor22 = xor i32 %xor21, %x0
+  %xor23 = xor i32 %xor22, %y0
+  %xor24 = xor i32 %xor23, %z0
+  %xor25 = xor i32 %xor24, %a1
+  %xor26 = xor i32 %xor25, %b1
+  %xor27 = xor i32 %xor26, %c1
+  %xor28 = xor i32 %xor27, %d1
+  %xor29 = xor i32 %xor28, %e1
+  %xor30 = xor i32 %xor29, %f1
+  %xor31 = xor i32 %xor30, 1
+  %xor32 = xor i32 %xor31, 2
+  ret i32 %xor32
+}
+
+define noundef i32 @inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1, i32 noundef %g1) {
+entry:
+  %xor = xor i32 %a0, %b0
+  %xor1 = xor i32 %xor, %c0
+  %xor2 = xor i32 %xor1, %d0
+  %xor3 = xor i32 %xor2, %e0
+  %xor4 = xor i32 %xor3, %f0
+  %xor5 = xor i32 %xor4, %g0
+  %xor6 = xor i32 %xor5, %h0
+  %xor7 = xor i32 %xor6, %i0
+  %xor8 = xor i32 %xor7, %j0
+  %xor9 = xor i32 %xor8, %k0
+  %xor10 = xor i32 %xor9, %l0
+  %xor11 = xor i32 %xor10, %m0
+  %xor12 = xor i32 %xor11, %n0
+  %xor13 = xor i32 %xor12, %o0
+  %xor14 = xor i32 %xor13, %p0
+  %xor15 = xor i32 %xor14, %q0
+  %xor16 = xor i32 %xor15, %r0
+  %xor17 = xor i32 %xor16, %s0
+  %xor18 = xor i32 %xor17, %t0
+  %xor19 = xor i32 %xor18, %u0
+  %xor20 = xor i32 %xor19, %v0
+  %xor21 = xor i32 %xor20, %w0
+  %xor22 = xor i32 %xor21, %x0
+  %xor23 = xor i32 %xor22, %y0
+  %xor24 = xor i32 %xor23, %z0
+  %xor25 = xor i32 %xor24, %a1
+  %xor26 = xor i32 %xor25, %b1
+  %xor27 = xor i32 %xor26, %c1
+  %xor28 = xor i32 %xor27, %d1
+  %xor29 = xor i32 %xor28, %e1
+  %xor30 = xor i32 %xor29, %f1
+  %xor31 = xor i32 %xor30, %g1
+  %xor32 = xor i32 %xor30, 1
+  %xor33 = xor i32 %xor31, 2
+  ret i32 %xor33
+}
+
+; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. 
+define i32 @Caller(ptr noundef %in) {
+entry:
+  %arrayidx = getelementptr inbounds i32, ptr %in, i64 0
+  %a0 = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %in, i64 1
+  %b0 = load i32, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %in, i64 2
+  %c0 = load i32, ptr %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %in, i64 3
+  %d0 = load i32, ptr %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %in, i64 4
+  %e0 = load i32, ptr %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32, ptr %in, i64 5
+  %f0 = load i32, ptr %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %in, i64 6
+  %g0 = load i32, ptr %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32, ptr %in, i64 7
+  %h0 = load i32, ptr %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds i32, ptr %in, i64 8
+  %i0 = load i32, ptr %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %in, i64 9
+  %j0 = load i32, ptr %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32, ptr %in, i64 10
+  %k0 = load i32, ptr %arrayidx10, align 4
+  %arrayidx11 = getelementptr inbounds i32, ptr %in, i64 11
+  %l0 = load i32, ptr %arrayidx11, align 4
+  %arrayidx12 = getelementptr inbounds i32, ptr %in, i64 12
+  %m0 = load i32, ptr %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32, ptr %in, i64 13
+  %n0 = load i32, ptr %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32, ptr %in, i64 14
+  %o0 = load i32, ptr %arrayidx14, align 4
+  %arrayidx15 = getelementptr inbounds i32, ptr %in, i64 15
+  %p0 = load i32, ptr %arrayidx15, align 4
+  %arrayidx16 = getelementptr inbounds i32, ptr %in, i64 16
+  %q0 = load i32, ptr %arrayidx16, align 4
+  %arrayidx17 = getelementptr inbounds i32, ptr %in, i64 17
+  %r0 = load i32, ptr %arrayidx17, align 4
+  %arrayidx18 = getelementptr inbounds i32, ptr %in, i64 18
+  %s0 = load i32, ptr %arrayidx18, align 4
+  %arrayidx19 = getelementptr inbounds i32, ptr %in, i64 19
+  %t0 = load i32, ptr %arrayidx19, align 4
+  %arrayidx20 = getelementptr inbounds i32, ptr %in, i64 20
+  %u0 = load i32, ptr %arrayidx20, align 4
+  %arrayidx21 = getelementptr inbounds i32, ptr %in, i64 21
+  %v0 = load i32, ptr %arrayidx21, align 4
+  %arrayidx22 = getelementptr inbounds i32, ptr %in, i64 22
+  %w0 = load i32, ptr %arrayidx22, align 4
+  %arrayidx23 = getelementptr inbounds i32, ptr %in, i64 23
+  %x0 = load i32, ptr %arrayidx23, align 4
+  %arrayidx24 = getelementptr inbounds i32, ptr %in, i64 24
+  %y0 = load i32, ptr %arrayidx24, align 4
+  %arrayidx25 = getelementptr inbounds i32, ptr %in, i64 25
+  %z0 = load i32, ptr %arrayidx25, align 4
+  %arrayidx26 = getelementptr inbounds i32, ptr %in, i64 26
+  %a1 = load i32, ptr %arrayidx26, align 4
+  %arrayidx27 = getelementptr inbounds i32, ptr %in, i64 27
+  %b1 = load i32, ptr %arrayidx27, align 4
+  %arrayidx28 = getelementptr inbounds i32, ptr %in, i64 28
+  %c1 = load i32, ptr %arrayidx28, align 4
+  %arrayidx29 = getelementptr inbounds i32, ptr %in, i64 29
+  %d1 = load i32, ptr %arrayidx29, align 4
+  %arrayidx30 = getelementptr inbounds i32, ptr %in, i64 30
+  %e1 = load i32, ptr %arrayidx30, align 4
+  %arrayidx31 = getelementptr inbounds i32, ptr %in, i64 31
+  %f1 = load i32, ptr %arrayidx31, align 4
+  %arrayidx32 = getelementptr inbounds i32, ptr %in, i64 32
+  %g1 = load i32, ptr %arrayidx32, align 4
+  %noinlinecall1 = call noundef i32 @non_inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1)
+  %add = add i32 0, %noinlinecall1
+  %noinlinecall2 = call noundef i32 @non_inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1)
+  %add2 = add i32 %add, %noinlinecall2
+  %inlinecall1 = call noundef i32 @inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1, i32 noundef %g1)
+  %add3 = add i32 %add2, %inlinecall1
+  %inlinecall2 = call noundef i32 @inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1, i32 noundef %g1)
+  %add4 = add i32 %add3, %inlinecall2
+  ret i32 %add4
+}

diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-array-ptr-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-array-ptr-argument.ll
new file mode 100644
index 0000000000000..61a8ab6056c49
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-array-ptr-argument.ll
@@ -0,0 +1,118 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s
+
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall1 = call noundef i64 @non_inlining_call
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall2 = call noundef i64 @non_inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall1 = call noundef i64 @inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall2 = call noundef i64 @inlining_call
+
+define noundef i64 @non_inlining_call([2 x ptr] noundef %ptrarr, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0) {
+entry:
+  %ptra0 = extractvalue [2 x ptr] %ptrarr, 0
+  %ptrb0 = extractvalue [2 x ptr] %ptrarr, 1
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = load i64, ptr %ptrd0, align 8
+  %e0 = load i64, ptr %ptre0, align 8
+  %f0 = load i64, ptr %ptrf0, align 8
+  %g0 = load i64, ptr %ptrg0, align 8
+  %h0 = load i64, ptr %ptrh0, align 8
+  %i0 = load i64, ptr %ptri0, align 8
+  %j0 = load i64, ptr %ptrj0, align 8
+  %k0 = load i64, ptr %ptrk0, align 8
+  %l0 = load i64, ptr %ptrl0, align 8
+  %m0 = load i64, ptr %ptrm0, align 8
+  %n0 = load i64, ptr %ptrn0, align 8
+  %o0 = load i64, ptr %ptro0, align 8
+  %p0 = load i64, ptr %ptrp0, align 8
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  ret i64 %xor14
+}
+
+define noundef i64 @inlining_call([2 x ptr] noundef %ptrarr, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0, ptr noundef %ptrq0) {
+entry:
+  %ptra0 = extractvalue [2 x ptr] %ptrarr, 0
+  %ptrb0 = extractvalue [2 x ptr] %ptrarr, 1
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = load i64, ptr %ptrd0, align 8
+  %e0 = load i64, ptr %ptre0, align 8
+  %f0 = load i64, ptr %ptrf0, align 8
+  %g0 = load i64, ptr %ptrg0, align 8
+  %h0 = load i64, ptr %ptrh0, align 8
+  %i0 = load i64, ptr %ptri0, align 8
+  %j0 = load i64, ptr %ptrj0, align 8
+  %k0 = load i64, ptr %ptrk0, align 8
+  %l0 = load i64, ptr %ptrl0, align 8
+  %m0 = load i64, ptr %ptrm0, align 8
+  %n0 = load i64, ptr %ptrn0, align 8
+  %o0 = load i64, ptr %ptro0, align 8
+  %p0 = load i64, ptr %ptrp0, align 8
+  %q0 = load i64, ptr %ptrq0, align 8
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  %xor15 = xor i64 %xor14, %q0
+  ret i64 %xor15
+}
+
+; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. 
+define i64 @Caller(ptr noundef %in) {
+entry:
+  %a0 = getelementptr inbounds i64, ptr %in, i64 0
+  %b0 = getelementptr inbounds i64, ptr %in, i64 1
+  %arr0 = insertvalue [2 x ptr] undef, ptr %a0, 0
+  %arr1 = insertvalue [2 x ptr] %arr0, ptr %b0, 1
+  %c0 = getelementptr inbounds i64, ptr %in, i64 2
+  %d0 = getelementptr inbounds i64, ptr %in, i64 3
+  %e0 = getelementptr inbounds i64, ptr %in, i64 4
+  %f0 = getelementptr inbounds i64, ptr %in, i64 5
+  %g0 = getelementptr inbounds i64, ptr %in, i64 6
+  %h0 = getelementptr inbounds i64, ptr %in, i64 7
+  %i0 = getelementptr inbounds i64, ptr %in, i64 8
+  %j0 = getelementptr inbounds i64, ptr %in, i64 9
+  %k0 = getelementptr inbounds i64, ptr %in, i64 10
+  %l0 = getelementptr inbounds i64, ptr %in, i64 11
+  %m0 = getelementptr inbounds i64, ptr %in, i64 12
+  %n0 = getelementptr inbounds i64, ptr %in, i64 13
+  %o0 = getelementptr inbounds i64, ptr %in, i64 14
+  %p0 = getelementptr inbounds i64, ptr %in, i64 15
+  %q0 = getelementptr inbounds i64, ptr %in, i64 16
+  %noinlinecall1 = call noundef i64 @non_inlining_call([2 x ptr] noundef %arr1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0)
+  %add = add i64 0, %noinlinecall1
+  %noinlinecall2 = call noundef i64 @non_inlining_call([2 x ptr] noundef %arr1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0)
+  %add2 = add i64 %add, %noinlinecall2
+  %inlinecall1 = call noundef i64 @inlining_call([2 x ptr] noundef %arr1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0)
+  %add3 = add i64 %add2, %inlinecall1
+  %inlinecall2 = call noundef i64 @inlining_call([2 x ptr] noundef %arr1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0)
+  %add4 = add i64 %add3, %inlinecall2
+  ret i64 %add4
+}

diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-indirect-call-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-indirect-call-argument.ll
new file mode 100644
index 0000000000000..8a44f7d3b247d
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-indirect-call-argument.ll
@@ -0,0 +1,21 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s
+
+; CHECK:      Inlining (cost={{-+[0-9]+}}, threshold=330), Call:   call void @Dummy
+
+define void @Wrapper(ptr nocapture nofree noundef readonly %func, i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) {
+entry:
+  call void %func(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0)
+  ret void
+}
+
+define internal void @Dummy(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) {
+entry:
+  ret void
+}
+
+define void @Caller(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) minsize {
+entry:
+  call void @Wrapper(ptr noundef @Dummy, i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0)
+  ret void
+}

diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-ptr-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-ptr-argument.ll
new file mode 100644
index 0000000000000..e03fc1cbd4573
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-ptr-argument.ll
@@ -0,0 +1,112 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s
+
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall1 = call noundef i64 @non_inlining_call
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall2 = call noundef i64 @non_inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall1 = call noundef i64 @inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall2 = call noundef i64 @inlining_call
+
+define noundef i64 @non_inlining_call(ptr noundef %ptra0, ptr noundef %ptrb0, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0) {
+entry:
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = load i64, ptr %ptrd0, align 8
+  %e0 = load i64, ptr %ptre0, align 8
+  %f0 = load i64, ptr %ptrf0, align 8
+  %g0 = load i64, ptr %ptrg0, align 8
+  %h0 = load i64, ptr %ptrh0, align 8
+  %i0 = load i64, ptr %ptri0, align 8
+  %j0 = load i64, ptr %ptrj0, align 8
+  %k0 = load i64, ptr %ptrk0, align 8
+  %l0 = load i64, ptr %ptrl0, align 8
+  %m0 = load i64, ptr %ptrm0, align 8
+  %n0 = load i64, ptr %ptrn0, align 8
+  %o0 = load i64, ptr %ptro0, align 8
+  %p0 = load i64, ptr %ptrp0, align 8
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  ret i64 %xor14
+}
+
+define noundef i64 @inlining_call(ptr noundef %ptra0, ptr noundef %ptrb0, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0, ptr noundef %ptrq0) {
+entry:
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = load i64, ptr %ptrd0, align 8
+  %e0 = load i64, ptr %ptre0, align 8
+  %f0 = load i64, ptr %ptrf0, align 8
+  %g0 = load i64, ptr %ptrg0, align 8
+  %h0 = load i64, ptr %ptrh0, align 8
+  %i0 = load i64, ptr %ptri0, align 8
+  %j0 = load i64, ptr %ptrj0, align 8
+  %k0 = load i64, ptr %ptrk0, align 8
+  %l0 = load i64, ptr %ptrl0, align 8
+  %m0 = load i64, ptr %ptrm0, align 8
+  %n0 = load i64, ptr %ptrn0, align 8
+  %o0 = load i64, ptr %ptro0, align 8
+  %p0 = load i64, ptr %ptrp0, align 8
+  %q0 = load i64, ptr %ptrq0, align 8
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  %xor15 = xor i64 %xor14, %q0
+  ret i64 %xor15
+}
+
+; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. 
+define i64 @Caller(ptr noundef %in) {
+entry:
+  %a0 = getelementptr inbounds i64, ptr %in, i64 0
+  %b0 = getelementptr inbounds i64, ptr %in, i64 1
+  %c0 = getelementptr inbounds i64, ptr %in, i64 2
+  %d0 = getelementptr inbounds i64, ptr %in, i64 3
+  %e0 = getelementptr inbounds i64, ptr %in, i64 4
+  %f0 = getelementptr inbounds i64, ptr %in, i64 5
+  %g0 = getelementptr inbounds i64, ptr %in, i64 6
+  %h0 = getelementptr inbounds i64, ptr %in, i64 7
+  %i0 = getelementptr inbounds i64, ptr %in, i64 8
+  %j0 = getelementptr inbounds i64, ptr %in, i64 9
+  %k0 = getelementptr inbounds i64, ptr %in, i64 10
+  %l0 = getelementptr inbounds i64, ptr %in, i64 11
+  %m0 = getelementptr inbounds i64, ptr %in, i64 12
+  %n0 = getelementptr inbounds i64, ptr %in, i64 13
+  %o0 = getelementptr inbounds i64, ptr %in, i64 14
+  %p0 = getelementptr inbounds i64, ptr %in, i64 15
+  %q0 = getelementptr inbounds i64, ptr %in, i64 16
+  %noinlinecall1 = call noundef i64 @non_inlining_call(ptr noundef %a0, ptr noundef %b0, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0)
+  %add = add i64 0, %noinlinecall1
+  %noinlinecall2 = call noundef i64 @non_inlining_call(ptr noundef %a0, ptr noundef %b0, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0)
+  %add2 = add i64 %add, %noinlinecall2
+  %inlinecall1 = call noundef i64 @inlining_call(ptr noundef %a0, ptr noundef %b0, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0)
+  %add3 = add i64 %add2, %inlinecall1
+  %inlinecall2 = call noundef i64 @inlining_call(ptr noundef %a0, ptr noundef %b0, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0)
+  %add4 = add i64 %add3, %inlinecall2
+  ret i64 %add4
+}

diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-struct-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-struct-argument.ll
new file mode 100644
index 0000000000000..91c9ee1ad3c09
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-struct-argument.ll
@@ -0,0 +1,171 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s
+
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall1 = call noundef i64 @non_inlining_call
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall2 = call noundef i64 @non_inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall1 = call noundef i64 @inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall2 = call noundef i64 @inlining_call
+
+%noinlineT =  type {{ptr, ptr}, ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64}
+%inlineT =    type {{ptr, ptr}, ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64}
+
+define noundef i64 @non_inlining_call(%noinlineT noundef %struc) {
+entry:
+  %ptra0 = extractvalue %noinlineT %struc, 0, 0
+  %ptrb0 = extractvalue %noinlineT %struc, 0, 1
+  %ptrc0 = extractvalue %noinlineT %struc, 1
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = extractvalue %noinlineT %struc, 2
+  %e0 = extractvalue %noinlineT %struc, 3
+  %f0 = extractvalue %noinlineT %struc, 4
+  %g0 = extractvalue %noinlineT %struc, 5
+  %h0 = extractvalue %noinlineT %struc, 6
+  %i0 = extractvalue %noinlineT %struc, 7
+  %j0 = extractvalue %noinlineT %struc, 8
+  %k0 = extractvalue %noinlineT %struc, 9
+  %l0 = extractvalue %noinlineT %struc, 10
+  %m0 = extractvalue %noinlineT %struc, 11
+  %n0 = extractvalue %noinlineT %struc, 12
+  %o0 = extractvalue %noinlineT %struc, 13
+  %p0 = extractvalue %noinlineT %struc, 14
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  ret i64 %xor14
+}
+
+define noundef i64 @inlining_call(%inlineT noundef %struc) {
+entry:
+  %ptra0 = extractvalue %inlineT %struc, 0, 0
+  %ptrb0 = extractvalue %inlineT %struc, 0, 1
+  %ptrc0 = extractvalue %inlineT %struc, 1
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = extractvalue %inlineT %struc, 2
+  %e0 = extractvalue %inlineT %struc, 3
+  %f0 = extractvalue %inlineT %struc, 4
+  %g0 = extractvalue %inlineT %struc, 5
+  %h0 = extractvalue %inlineT %struc, 6
+  %i0 = extractvalue %inlineT %struc, 7
+  %j0 = extractvalue %inlineT %struc, 8
+  %k0 = extractvalue %inlineT %struc, 9
+  %l0 = extractvalue %inlineT %struc, 10
+  %m0 = extractvalue %inlineT %struc, 11
+  %n0 = extractvalue %inlineT %struc, 12
+  %o0 = extractvalue %inlineT %struc, 13
+  %p0 = extractvalue %inlineT %struc, 14
+  %q0 = extractvalue %inlineT %struc, 15
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  %xor15 = xor i64 %xor14, %q0
+  ret i64 %xor15
+}
+
+; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. 
+define i64 @Caller(ptr noundef %in) {
+entry:
+  %ptra0 = getelementptr inbounds i64, ptr %in, i64 0
+  %ptrb0 = getelementptr inbounds i64, ptr %in, i64 1
+  %ptrc0 = getelementptr inbounds i64, ptr %in, i64 2
+  %ptrd0 = getelementptr inbounds i64, ptr %in, i64 3
+  %ptre0 = getelementptr inbounds i64, ptr %in, i64 4
+  %ptrf0 = getelementptr inbounds i64, ptr %in, i64 5
+  %ptrg0 = getelementptr inbounds i64, ptr %in, i64 6
+  %ptrh0 = getelementptr inbounds i64, ptr %in, i64 7
+  %ptri0 = getelementptr inbounds i64, ptr %in, i64 8
+  %ptrj0 = getelementptr inbounds i64, ptr %in, i64 9
+  %ptrk0 = getelementptr inbounds i64, ptr %in, i64 10
+  %ptrl0 = getelementptr inbounds i64, ptr %in, i64 11
+  %ptrm0 = getelementptr inbounds i64, ptr %in, i64 12
+  %ptrn0 = getelementptr inbounds i64, ptr %in, i64 13
+  %ptro0 = getelementptr inbounds i64, ptr %in, i64 14
+  %ptrp0 = getelementptr inbounds i64, ptr %in, i64 15
+  %ptrq0 = getelementptr inbounds i64, ptr %in, i64 16
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = load i64, ptr %ptrd0, align 8
+  %e0 = load i64, ptr %ptre0, align 8
+  %f0 = load i64, ptr %ptrf0, align 8
+  %g0 = load i64, ptr %ptrg0, align 8
+  %h0 = load i64, ptr %ptrh0, align 8
+  %i0 = load i64, ptr %ptri0, align 8
+  %j0 = load i64, ptr %ptrj0, align 8
+  %k0 = load i64, ptr %ptrk0, align 8
+  %l0 = load i64, ptr %ptrl0, align 8
+  %m0 = load i64, ptr %ptrm0, align 8
+  %n0 = load i64, ptr %ptrn0, align 8
+  %o0 = load i64, ptr %ptro0, align 8
+  %p0 = load i64, ptr %ptrp0, align 8
+  %q0 = load i64, ptr %ptrq0, align 8
+  %noinlinestruc1 = insertvalue %noinlineT undef, ptr %ptra0, 0, 0
+  %noinlinestruc2 = insertvalue %noinlineT %noinlinestruc1, ptr %ptrb0, 0, 1
+  %noinlinestruc3 = insertvalue %noinlineT %noinlinestruc2, ptr %ptrc0, 1
+  %noinlinestruc4 = insertvalue %noinlineT %noinlinestruc3, i64 %d0, 2
+  %noinlinestruc5 = insertvalue %noinlineT %noinlinestruc4, i64 %e0, 3
+  %noinlinestruc6 = insertvalue %noinlineT %noinlinestruc5, i64 %f0, 4
+  %noinlinestruc7 = insertvalue %noinlineT %noinlinestruc6, i64 %g0, 5
+  %noinlinestruc8 = insertvalue %noinlineT %noinlinestruc7, i64 %h0, 6
+  %noinlinestruc9 = insertvalue %noinlineT %noinlinestruc8, i64 %i0, 7
+  %noinlinestruc10 = insertvalue %noinlineT %noinlinestruc9, i64 %j0, 8
+  %noinlinestruc11 = insertvalue %noinlineT %noinlinestruc10, i64 %k0, 9
+  %noinlinestruc12 = insertvalue %noinlineT %noinlinestruc11, i64 %l0, 10
+  %noinlinestruc13 = insertvalue %noinlineT %noinlinestruc12, i64 %m0, 11
+  %noinlinestruc14 = insertvalue %noinlineT %noinlinestruc13, i64 %n0, 12
+  %noinlinestruc15 = insertvalue %noinlineT %noinlinestruc14, i64 %o0, 13
+  %noinlinestruc16 = insertvalue %noinlineT %noinlinestruc15, i64 %p0, 14
+  %inlinestruc1 = insertvalue %inlineT undef, ptr %ptra0, 0, 0
+  %inlinestruc2 = insertvalue %inlineT %inlinestruc1, ptr %ptrb0, 0, 1
+  %inlinestruc3 = insertvalue %inlineT %inlinestruc2, ptr %ptrc0, 1
+  %inlinestruc4 = insertvalue %inlineT %inlinestruc3, i64 %d0, 2
+  %inlinestruc5 = insertvalue %inlineT %inlinestruc4, i64 %e0, 3
+  %inlinestruc6 = insertvalue %inlineT %inlinestruc5, i64 %f0, 4
+  %inlinestruc7 = insertvalue %inlineT %inlinestruc6, i64 %g0, 5
+  %inlinestruc8 = insertvalue %inlineT %inlinestruc7, i64 %h0, 6
+  %inlinestruc9 = insertvalue %inlineT %inlinestruc8, i64 %i0, 7
+  %inlinestruc10 = insertvalue %inlineT %inlinestruc9, i64 %j0, 8
+  %inlinestruc11 = insertvalue %inlineT %inlinestruc10, i64 %k0, 9
+  %inlinestruc12 = insertvalue %inlineT %inlinestruc11, i64 %l0, 10
+  %inlinestruc13 = insertvalue %inlineT %inlinestruc12, i64 %m0, 11
+  %inlinestruc14 = insertvalue %inlineT %inlinestruc13, i64 %n0, 12
+  %inlinestruc15 = insertvalue %inlineT %inlinestruc14, i64 %o0, 13
+  %inlinestruc16 = insertvalue %inlineT %inlinestruc15, i64 %p0, 14
+  %inlinestruc17 = insertvalue %inlineT %inlinestruc16, i64 %q0, 15
+  %noinlinecall1 = call noundef i64 @non_inlining_call(%noinlineT noundef %noinlinestruc16)
+  %add = add i64 0, %noinlinecall1
+  %noinlinecall2 = call noundef i64 @non_inlining_call(%noinlineT noundef %noinlinestruc16)
+  %add2 = add i64 %add, %noinlinecall2
+  %inlinecall1 = call noundef i64 @inlining_call(%inlineT noundef %inlinestruc17)
+  %add3 = add i64 %add2, %inlinecall1
+  %inlinecall2 = call noundef i64 @inlining_call(%inlineT noundef %inlinestruc17)
+  %add4 = add i64 %add3, %inlinecall2
+  ret i64 %add4
+}

diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-vector-ptr-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-vector-ptr-argument.ll
new file mode 100644
index 0000000000000..bbb26a36b2e2d
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-vector-ptr-argument.ll
@@ -0,0 +1,118 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s
+
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall1 = call noundef i64 @non_inlining_call
+; CHECK:      NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %noinlinecall2 = call noundef i64 @non_inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall1 = call noundef i64 @inlining_call
+; CHECK-NOT:  NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call:   %inlinecall2 = call noundef i64 @inlining_call
+
+define noundef i64 @non_inlining_call(<2 x ptr> noundef %ptrvec, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0) {
+entry:
+  %ptra0 = extractelement <2 x ptr> %ptrvec, i32 0
+  %ptrb0 = extractelement <2 x ptr> %ptrvec, i32 1
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = load i64, ptr %ptrd0, align 8
+  %e0 = load i64, ptr %ptre0, align 8
+  %f0 = load i64, ptr %ptrf0, align 8
+  %g0 = load i64, ptr %ptrg0, align 8
+  %h0 = load i64, ptr %ptrh0, align 8
+  %i0 = load i64, ptr %ptri0, align 8
+  %j0 = load i64, ptr %ptrj0, align 8
+  %k0 = load i64, ptr %ptrk0, align 8
+  %l0 = load i64, ptr %ptrl0, align 8
+  %m0 = load i64, ptr %ptrm0, align 8
+  %n0 = load i64, ptr %ptrn0, align 8
+  %o0 = load i64, ptr %ptro0, align 8
+  %p0 = load i64, ptr %ptrp0, align 8
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  ret i64 %xor14
+}
+
+define noundef i64 @inlining_call(<2 x ptr> noundef %ptrvec, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0, ptr noundef %ptrq0) {
+entry:
+  %ptra0 = extractelement <2 x ptr> %ptrvec, i32 0
+  %ptrb0 = extractelement <2 x ptr> %ptrvec, i32 1
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = load i64, ptr %ptrd0, align 8
+  %e0 = load i64, ptr %ptre0, align 8
+  %f0 = load i64, ptr %ptrf0, align 8
+  %g0 = load i64, ptr %ptrg0, align 8
+  %h0 = load i64, ptr %ptrh0, align 8
+  %i0 = load i64, ptr %ptri0, align 8
+  %j0 = load i64, ptr %ptrj0, align 8
+  %k0 = load i64, ptr %ptrk0, align 8
+  %l0 = load i64, ptr %ptrl0, align 8
+  %m0 = load i64, ptr %ptrm0, align 8
+  %n0 = load i64, ptr %ptrn0, align 8
+  %o0 = load i64, ptr %ptro0, align 8
+  %p0 = load i64, ptr %ptrp0, align 8
+  %q0 = load i64, ptr %ptrq0, align 8
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  %xor15 = xor i64 %xor14, %q0
+  ret i64 %xor15
+}
+
+; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. 
+define i64 @Caller(ptr noundef %in) {
+entry:
+  %a0 = getelementptr inbounds i64, ptr %in, i64 0
+  %b0 = getelementptr inbounds i64, ptr %in, i64 1
+  %vec0 = insertelement <2 x ptr> undef, ptr %a0, i32 0
+  %vec1 = insertelement <2 x ptr> %vec0, ptr %b0, i32 1
+  %c0 = getelementptr inbounds i64, ptr %in, i64 2
+  %d0 = getelementptr inbounds i64, ptr %in, i64 3
+  %e0 = getelementptr inbounds i64, ptr %in, i64 4
+  %f0 = getelementptr inbounds i64, ptr %in, i64 5
+  %g0 = getelementptr inbounds i64, ptr %in, i64 6
+  %h0 = getelementptr inbounds i64, ptr %in, i64 7
+  %i0 = getelementptr inbounds i64, ptr %in, i64 8
+  %j0 = getelementptr inbounds i64, ptr %in, i64 9
+  %k0 = getelementptr inbounds i64, ptr %in, i64 10
+  %l0 = getelementptr inbounds i64, ptr %in, i64 11
+  %m0 = getelementptr inbounds i64, ptr %in, i64 12
+  %n0 = getelementptr inbounds i64, ptr %in, i64 13
+  %o0 = getelementptr inbounds i64, ptr %in, i64 14
+  %p0 = getelementptr inbounds i64, ptr %in, i64 15
+  %q0 = getelementptr inbounds i64, ptr %in, i64 16
+  %noinlinecall1 = call noundef i64 @non_inlining_call(<2 x ptr> noundef %vec1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0)
+  %add = add i64 0, %noinlinecall1
+  %noinlinecall2 = call noundef i64 @non_inlining_call(<2 x ptr> noundef %vec1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0)
+  %add2 = add i64 %add, %noinlinecall2
+  %inlinecall1 = call noundef i64 @inlining_call(<2 x ptr> noundef %vec1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0)
+  %add3 = add i64 %add2, %inlinecall1
+  %inlinecall2 = call noundef i64 @inlining_call(<2 x ptr> noundef %vec1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0)
+  %add4 = add i64 %add3, %inlinecall2
+  ret i64 %add4
+}


        

