[clang] [llvm] [BPF] Support Stack Arguments (PR #189060)

via cfe-commits cfe-commits at lists.llvm.org
Fri Mar 27 10:48:18 PDT 2026


https://github.com/yonghong-song updated https://github.com/llvm/llvm-project/pull/189060

>From d743d11360fd18ea283d751c084a1b0232f358dc Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Wed, 18 Mar 2026 13:29:09 -0700
Subject: [PATCH] [BPF] Support Stack Arguments

Currently, bpf programs and kfuncs only support 5 register parameters.
As the bpf community and its use cases keep expanding, there is a need
to go beyond 5 register parameters by passing additional parameters
on the stack. There are two main use cases here:
  1. Currently a kfunc is limited to 5 register parameters. In some
     special situations, people may want to have more than 5
     parameters. One example is sched_ext.
  2. Allowing stack parameters makes life easier for bpf program
     writers since they no longer need to carefully limit the number
     of parameters in their programs.

The following is the high-level design:
  - Use bpf register R12 as the frame pointer to stack parameters.
    This is to avoid mixing stacks due to R10.
  - Stack parameters must be after 5 register parameters.
  - All parameters should be at most 16 bytes as ByVal parameters
    are not supported.
  - Support for cpu v1 to v4 so all cpu versions can use this.
    A feature macro __BPF_FEATURE_STACK_ARGUMENT is defined
    and users can check whether stack argument is supported or not.

Below is a simple asm code example of stack parameters:

  bar:
    /* Retrieve two parameters from the caller of bar(). */
    rX = *(u64 *)(r12 - 8)
    rY = *(u64 *)(r12 - 16)
    ...
    /* Prepare the single stack parameter for foo1 */
    *(u64 *)(r12 - 24) = rZ
    call foo1
    ...
    /* Prepare the two stack parameters for foo2 */
    *(u64 *)(r12 - 24) = rX
    *(u64 *)(r12 - 32) = rY
    call foo2
    ...
  foo1:
    /* Retrieve parameter '*(u64 *)(r12 - 24) = rZ' from bar(),
     * and assign the value rZ to rX.
     */
    rX = *(u64 *)(r12 - 8)
    ...
  foo2:
    /* Retrieve parameters '*(u64 *)(r12 - 24/32) = rX/rY' from bar(),
     * and assign values rX/rY to rU/rV.
     */
    rU = *(u64 *)(r12 - 8)
    rV = *(u64 *)(r12 - 16)
    ...

The special handling of r12 will be in the kernel bpf jit, which will
actually allocate space for stack parameters. For example, for function
bar(), the jit could allocate a 16-byte stack parameter space to cover
the stores to 'r12 - {24,32}' (the maximum between foo1 and foo2).
foo1() and foo2() can then retrieve the values from the
caller-allocated parameter space.
   bar (*(u64 *)(r12 - 24) = rZ) -> foo1 (rX = *(u64 *)(r12 - 8))
   bar (*(u64 *)(r12 - 24) = rX) -> foo2 (rU = *(u64 *)(r12 - 8))
   bar (*(u64 *)(r12 - 32) = rY) -> foo2 (rV = *(u64 *)(r12 - 16))

Internally, in the bpf backend, pseudo insns are generated for
load_stack_arg and store_stack_arg. The BPFMIPeephole pass then
expands these pseudo insns into real bpf insns like the ones above.
---
 clang/lib/Basic/Targets/BPF.cpp               |   1 +
 .../test/Preprocessor/bpf-predefined-macros.c |   8 +
 llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp       |  32 +++
 llvm/lib/Target/BPF/BPFISelLowering.cpp       | 118 ++++++++---
 llvm/lib/Target/BPF/BPFInstrInfo.td           |  32 +++
 llvm/lib/Target/BPF/BPFMIPeephole.cpp         |  60 ++++++
 llvm/lib/Target/BPF/BPFMachineFunctionInfo.h  |  20 ++
 llvm/lib/Target/BPF/BPFRegisterInfo.cpp       |   1 +
 llvm/lib/Target/BPF/BPFRegisterInfo.td        |   4 +-
 llvm/lib/Target/BPF/BPFTargetMachine.cpp      |   8 +
 llvm/lib/Target/BPF/BPFTargetMachine.h        |   4 +
 .../BPF/Disassembler/BPFDisassembler.cpp      |  14 +-
 llvm/test/CodeGen/BPF/many_args1.ll           |   6 +-
 llvm/test/CodeGen/BPF/many_args2.ll           |   6 +-
 llvm/test/CodeGen/BPF/many_args3.ll           | 196 ++++++++++++++++++
 llvm/test/CodeGen/BPF/many_args4.ll           |  62 ++++++
 llvm/test/CodeGen/BPF/many_args5.ll           |  22 ++
 llvm/test/CodeGen/BPF/many_args6.ll           |  23 ++
 llvm/test/CodeGen/BPF/many_args7.ll           |  36 ++++
 19 files changed, 606 insertions(+), 47 deletions(-)
 create mode 100644 llvm/lib/Target/BPF/BPFMachineFunctionInfo.h
 create mode 100644 llvm/test/CodeGen/BPF/many_args3.ll
 create mode 100644 llvm/test/CodeGen/BPF/many_args4.ll
 create mode 100644 llvm/test/CodeGen/BPF/many_args5.ll
 create mode 100644 llvm/test/CodeGen/BPF/many_args6.ll
 create mode 100644 llvm/test/CodeGen/BPF/many_args7.ll

diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp
index 8de1083d758c7..100769ea4cdb1 100644
--- a/clang/lib/Basic/Targets/BPF.cpp
+++ b/clang/lib/Basic/Targets/BPF.cpp
@@ -46,6 +46,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
   Builder.defineMacro("__BPF_FEATURE_ADDR_SPACE_CAST");
   Builder.defineMacro("__BPF_FEATURE_MAY_GOTO");
   Builder.defineMacro("__BPF_FEATURE_ATOMIC_MEM_ORDERING");
+  Builder.defineMacro("__BPF_FEATURE_STACK_ARGUMENT");
 
   if (CPU.empty())
     CPU = "v3";
diff --git a/clang/test/Preprocessor/bpf-predefined-macros.c b/clang/test/Preprocessor/bpf-predefined-macros.c
index b4e37fdd7de37..90287b7b24e95 100644
--- a/clang/test/Preprocessor/bpf-predefined-macros.c
+++ b/clang/test/Preprocessor/bpf-predefined-macros.c
@@ -76,6 +76,9 @@ int w;
 #ifdef __BPF_FEATURE_ATOMIC_MEM_ORDERING
 int x;
 #endif
+#ifdef __BPF_FEATURE_STACK_ARGUMENT
+int y;
+#endif
 
 // CHECK: int b;
 // CHECK: int c;
@@ -123,6 +126,11 @@ int x;
 // CPU_V3: int x;
 // CPU_V4: int x;
 
+// CPU_V1: int y;
+// CPU_V2: int y;
+// CPU_V3: int y;
+// CPU_V4: int y;
+
 // CPU_GENERIC: int g;
 
 // CPU_PROBE: int f;
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index dadba52de4627..af034b134e632 100644
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -193,6 +193,38 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
   switch (Opcode) {
   default:
     break;
+  case BPFISD::LOAD_STACK_ARG: {
+    SDValue Chain = Node->getOperand(0);
+    auto *CN = cast<ConstantSDNode>(Node->getOperand(1));
+    SDValue Off =
+        CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(Node), MVT::i64);
+    EVT ValVT = Node->getValueType(0);
+    CurDAG->SelectNodeTo(Node, BPF::LOAD_STACK_ARG_PSEUDO, ValVT, MVT::Other,
+                         Off, Chain);
+    return;
+  }
+
+  case BPFISD::STORE_STACK_ARG: {
+    SDValue Chain = Node->getOperand(0);
+    auto *CN = cast<ConstantSDNode>(Node->getOperand(1));
+    SDValue Off =
+        CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(Node), MVT::i64);
+    SDValue Val = Node->getOperand(2);
+
+    // Use store-immediate when the value is a constant that fits in 32 bits.
+    if (auto *ValCN = dyn_cast<ConstantSDNode>(Val);
+        ValCN && Subtarget->hasStoreImm() && isInt<32>(ValCN->getSExtValue())) {
+      SDValue Imm = CurDAG->getTargetConstant(ValCN->getSExtValue(),
+                                              SDLoc(Node), MVT::i64);
+      CurDAG->SelectNodeTo(Node, BPF::STORE_STACK_ARG_IMM_PSEUDO, MVT::Other,
+                           Off, Imm, Chain);
+    } else {
+      CurDAG->SelectNodeTo(Node, BPF::STORE_STACK_ARG_PSEUDO, MVT::Other, Off,
+                           Val, Chain);
+    }
+    return;
+  }
+
   case ISD::FrameIndex: {
     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
     EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index fd26345920a71..010b47e20925d 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -13,6 +13,7 @@
 
 #include "BPFISelLowering.h"
 #include "BPF.h"
+#include "BPFMachineFunctionInfo.h"
 #include "BPFSubtarget.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -376,6 +377,37 @@ SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 // Calling Convention Implementation
 #include "BPFGenCallingConv.inc"
 
+// Return true if any aggregate argument is split between registers and stack,
+// i.e. a stack-assigned part directly follows a register-assigned part that
+// has the same original argument index.
+template <typename GetOrigIdxFn>
+static bool hasSplitArg(const SmallVectorImpl<CCValAssign> &ArgLocs,
+                        GetOrigIdxFn GetOrigIdx) {
+  // Start at 1: a split needs a predecessor, and this keeps 'I - 1' in range.
+  for (size_t I = 1; I < ArgLocs.size(); ++I) {
+    if (!ArgLocs[I].isMemLoc() || !ArgLocs[I - 1].isRegLoc())
+      continue;
+    if (GetOrigIdx(I) == GetOrigIdx(I - 1))
+      return true;
+  }
+  return false;
+}
+
+// Recover an argument of type VA.getValVT() from its promoted location value:
+// record the sign/zero extension (if any) with an Assert node, then truncate.
+static SDValue convertLocValType(SelectionDAG &DAG, const SDLoc &DL,
+                                 const CCValAssign &VA, EVT RegVT,
+                                 SDValue ArgValue) {
+  const CCValAssign::LocInfo Info = VA.getLocInfo();
+  if (Info == CCValAssign::Full)
+    return ArgValue; // Not promoted; nothing to assert or truncate.
+  if (Info == CCValAssign::SExt || Info == CCValAssign::ZExt)
+    ArgValue = DAG.getNode(
+        Info == CCValAssign::SExt ? ISD::AssertSext : ISD::AssertZext, DL,
+        RegVT, ArgValue, DAG.getValueType(VA.getValVT()));
+  return DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+}
+
 SDValue BPFTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -396,13 +428,16 @@ SDValue BPFTargetLowering::LowerFormalArguments(
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, getHasAlu32() ? CC_BPF32 : CC_BPF64);
 
-  bool HasMemArgs = false;
+  if (hasSplitArg(ArgLocs, [&](size_t I) { return Ins[I].getOrigArgIndex(); }))
+    fail(DL, DAG, "aggregate argument is split between registers and stack");
+
+  int IncomingExtDepth = 0;
   for (size_t I = 0; I < ArgLocs.size(); ++I) {
     auto &VA = ArgLocs[I];
+    EVT RegVT = VA.getLocVT();
 
     if (VA.isRegLoc()) {
       // Arguments passed in registers
-      EVT RegVT = VA.getLocVT();
       MVT::SimpleValueType SimpleTy = RegVT.getSimpleVT().SimpleTy;
       switch (SimpleTy) {
       default: {
@@ -419,33 +454,34 @@ SDValue BPFTargetLowering::LowerFormalArguments(
             SimpleTy == MVT::i64 ? &BPF::GPRRegClass : &BPF::GPR32RegClass);
         RegInfo.addLiveIn(VA.getLocReg(), VReg);
         SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
+        InVals.push_back(convertLocValType(DAG, DL, VA, RegVT, ArgValue));
+        break;
+      }
+      continue;
+    }
 
-        // If this is an value that has been promoted to wider types, insert an
-        // assert[sz]ext to capture this, then truncate to the right size.
-        if (VA.getLocInfo() == CCValAssign::SExt)
-          ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
-                                 DAG.getValueType(VA.getValVT()));
-        else if (VA.getLocInfo() == CCValAssign::ZExt)
-          ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
-                                 DAG.getValueType(VA.getValVT()));
-
-        if (VA.getLocInfo() != CCValAssign::Full)
-          ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
-
-        InVals.push_back(ArgValue);
-
+    if (VA.isMemLoc()) {
+      IncomingExtDepth = VA.getLocMemOffset() + 8;
+      int Off = -IncomingExtDepth;
+      if (Off < INT16_MIN) {
+        fail(DL, DAG, "extra parameter stack depth exceeded limit");
         break;
       }
-    } else {
-      if (VA.isMemLoc())
-        HasMemArgs = true;
-      else
-        report_fatal_error("unhandled argument location");
-      InVals.push_back(DAG.getConstant(0, DL, VA.getLocVT()));
+
+      // Physical extra argument slot is always 64-bit.
+      SDValue StackVal = DAG.getNode(BPFISD::LOAD_STACK_ARG, DL,
+                                     DAG.getVTList(MVT::i64, MVT::Other), Chain,
+                                     DAG.getConstant(Off, DL, MVT::i64));
+      SDValue ArgValue = StackVal.getValue(0);
+      Chain = StackVal.getValue(1);
+      InVals.push_back(convertLocValType(DAG, DL, VA, MVT::i64, ArgValue));
+      continue;
     }
   }
-  if (HasMemArgs)
-    fail(DL, DAG, "stack arguments are not supported");
+
+  auto &BFI = *MF.getInfo<BPFMachineFunctionInfo>();
+  BFI.setIncomingExtDepth(IncomingExtDepth);
+
   if (IsVarArg)
     fail(DL, DAG, "variadic functions are not supported");
   if (MF.getFunction().hasStructRetAttr())
@@ -454,8 +490,6 @@ SDValue BPFTargetLowering::LowerFormalArguments(
   return Chain;
 }
 
-const size_t BPFTargetLowering::MaxArgs = 5;
-
 static void resetRegMaskBit(const TargetRegisterInfo *TRI, uint32_t *RegMask,
                             MCRegister Reg) {
   for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
@@ -503,8 +537,9 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   unsigned NumBytes = CCInfo.getStackSize();
 
-  if (Outs.size() > MaxArgs)
-    fail(CLI.DL, DAG, "too many arguments", Callee);
+  if (hasSplitArg(ArgLocs, [&](size_t I) { return Outs[I].OrigArgIndex; }))
+    fail(CLI.DL, DAG, "aggregate argument is split between registers and stack",
+         Callee);
 
   for (auto &Arg : Outs) {
     ISD::ArgFlagsTy Flags = Arg.Flags;
@@ -517,10 +552,12 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   auto PtrVT = getPointerTy(MF.getDataLayout());
   Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
 
-  SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass;
+  SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
+  auto &BFI = *MF.getInfo<BPFMachineFunctionInfo>();
+  int Base = BFI.getIncomingExtDepth() + 8;
 
   // Walk arg assignments
-  for (size_t i = 0; i < std::min(ArgLocs.size(), MaxArgs); ++i) {
+  for (size_t i = 0; i < OutVals.size(); ++i) {
     CCValAssign &VA = ArgLocs[i];
     SDValue &Arg = OutVals[i];
 
@@ -542,10 +579,25 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     }
 
     // Push arguments into RegsToPass vector
-    if (VA.isRegLoc())
+    if (VA.isRegLoc()) {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-    else
-      report_fatal_error("stack arguments are not supported");
+      continue;
+    }
+
+    if (VA.isMemLoc()) {
+      int Off = -(Base + VA.getLocMemOffset());
+      if (Off < INT16_MIN) {
+        fail(CLI.DL, DAG, "extra parameter stack depth exceeded limit");
+        break;
+      }
+
+      SDValue OffVal = DAG.getConstant(Off, CLI.DL, MVT::i64);
+      Chain = DAG.getNode(BPFISD::STORE_STACK_ARG, CLI.DL, MVT::Other, Chain,
+                          OffVal, Arg);
+      continue;
+    }
+
+    report_fatal_error("unhandled argument location");
   }
 
   SDValue InGlue;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 3d2050e26ca0d..8d95365dbc463 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -43,6 +43,22 @@ def BPFcallseq_end  : SDNode<"ISD::CALLSEQ_END",   SDT_BPFCallSeqEnd,
                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 def BPFbrcc         : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, [SDNPHasChain]>;
 
+def SDT_BPFLoadStackArg : SDTypeProfile<1, 1, [
+  SDTCisVT<0, i64>,  // result: value loaded from the stack-argument slot
+  SDTCisVT<1, i64>   // operand: constant offset of the slot
+]>;
+def SDT_BPFStoreStackArg : SDTypeProfile<0, 2, [
+  SDTCisVT<0, i64>,  // operand 0: constant offset of the slot
+  SDTCisVT<1, i64>   // operand 1: value stored into the slot
+]>;
+
+def BPFload_stack_arg : SDNode<"BPFISD::LOAD_STACK_ARG",
+                               SDT_BPFLoadStackArg,
+                               [SDNPHasChain, SDNPMayLoad]>;
+def BPFstore_stack_arg : SDNode<"BPFISD::STORE_STACK_ARG",
+                                SDT_BPFStoreStackArg,
+                                [SDNPHasChain, SDNPMayStore]>;
+
 def BPFselectcc     : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC>;
 def BPFWrapper      : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
 def BPFmemcpy       : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
@@ -300,6 +316,22 @@ let Predicates = [BPFHasGotox] in {
 }
 }
 
+let hasSideEffects = 1, mayLoad = 1 in {
+  def LOAD_STACK_ARG_PSEUDO // Expanded to LDD off R12 in BPFMIPreEmitPeephole.
+    : Pseudo<(outs GPR:$dst), (ins s16imm:$off),
+             "load_stack_arg\t$dst, $off", []>;
+}
+
+let hasSideEffects = 1, mayStore = 1 in {
+  def STORE_STACK_ARG_PSEUDO // Expanded to STD off R12 in BPFMIPreEmitPeephole.
+    : Pseudo<(outs), (ins s16imm:$off, GPR:$src),
+             "store_stack_arg\t$off, $src", []>;
+
+  def STORE_STACK_ARG_IMM_PSEUDO // Expanded to STD_imm; $val fits in 32 bits.
+    : Pseudo<(outs), (ins s16imm:$off, u64imm:$val),
+             "store_stack_arg_imm\t$off, $val", []>;
+}
+
 // ALU instructions
 class ALU_RI<BPFOpClass Class, BPFArithOp Opc, int off,
              dag outs, dag ins, string asmstr, list<dag> pattern>
diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
index b8e4db78955f5..591b0b2ef28a9 100644
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -323,6 +323,7 @@ struct BPFMIPreEmitPeephole : public MachineFunctionPass {
   bool insertMissingCallerSavedSpills();
   bool removeMayGotoZero();
   bool addExitAfterUnreachable();
+  bool expandStackArgPseudos();
 
 public:
 
@@ -340,6 +341,7 @@ struct BPFMIPreEmitPeephole : public MachineFunctionPass {
     Changed |= insertMissingCallerSavedSpills();
     Changed |= removeMayGotoZero();
     Changed |= addExitAfterUnreachable();
+    Changed |= expandStackArgPseudos();
     return Changed;
   }
 };
@@ -752,6 +754,64 @@ bool BPFMIPreEmitPeephole::addExitAfterUnreachable() {
   return true;
 }
 
+// Replace the stack-argument pseudo instructions with real BPF memory
+// instructions that address the stack-argument area through R12:
+//   LOAD_STACK_ARG_PSEUDO       -> LDD     (load into the dst register)
+//   STORE_STACK_ARG_PSEUDO      -> STD     (store of the src register)
+//   STORE_STACK_ARG_IMM_PSEUDO  -> STD_imm (store of an immediate)
+// Returns true if at least one pseudo was expanded.
+bool BPFMIPreEmitPeephole::expandStackArgPseudos() {
+  bool Modified = false;
+
+  for (MachineBasicBlock &Block : *MF) {
+    // Step the iterator before using the instruction, since the instruction
+    // is erased once its replacement has been inserted.
+    for (auto Next = Block.begin(), Last = Block.end(); Next != Last;) {
+      MachineInstr &Pseudo = *Next++;
+      const unsigned Opc = Pseudo.getOpcode();
+      DebugLoc Loc = Pseudo.getDebugLoc();
+
+      if (Opc == BPF::LOAD_STACK_ARG_PSEUDO) {
+        // Operands: (dst, off).
+        Register Dst = Pseudo.getOperand(0).getReg();
+        int16_t Offset = Pseudo.getOperand(1).getImm();
+
+        BuildMI(Block, Pseudo, Loc, TII->get(BPF::LDD), Dst)
+            .addReg(BPF::R12)
+            .addImm(Offset);
+      } else if (Opc == BPF::STORE_STACK_ARG_PSEUDO) {
+        // Operands: (off, src). Keep the kill flag of the source register.
+        int16_t Offset = Pseudo.getOperand(0).getImm();
+        const MachineOperand &Src = Pseudo.getOperand(1);
+        Register SrcReg = Src.getReg();
+        bool SrcKilled = Src.isKill();
+
+        BuildMI(Block, Pseudo, Loc, TII->get(BPF::STD))
+            .addReg(SrcReg, getKillRegState(SrcKilled))
+            .addReg(BPF::R12)
+            .addImm(Offset);
+      } else if (Opc == BPF::STORE_STACK_ARG_IMM_PSEUDO) {
+        // Operands: (off, val).
+        int16_t Offset = Pseudo.getOperand(0).getImm();
+        int32_t Imm = Pseudo.getOperand(1).getImm();
+
+        BuildMI(Block, Pseudo, Loc, TII->get(BPF::STD_imm))
+            .addImm(Imm)
+            .addReg(BPF::R12)
+            .addImm(Offset);
+      } else {
+        continue; // Not a stack-argument pseudo; leave it alone.
+      }
+
+      // All three cases above inserted a replacement before the pseudo.
+      Pseudo.eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  return Modified;
+}
+
 } // end default namespace
 
 INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",
diff --git a/llvm/lib/Target/BPF/BPFMachineFunctionInfo.h b/llvm/lib/Target/BPF/BPFMachineFunctionInfo.h
new file mode 100644
index 0000000000000..9c66242015066
--- /dev/null
+++ b/llvm/lib/Target/BPF/BPFMachineFunctionInfo.h
@@ -0,0 +1,20 @@
+#ifndef LLVM_LIB_TARGET_BPF_BPFMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+class BPFSubtarget; // Forward declaration; only a pointer is used.
+
+/// BPF per-function codegen state: the depth in bytes of the incoming
+/// stack-argument area (0 when no argument is passed on the stack).
+class BPFMachineFunctionInfo : public MachineFunctionInfo {
+  int IncomingExtDepth = 0;
+public:
+  BPFMachineFunctionInfo(const Function &F, const BPFSubtarget *STI) {}
+  int getIncomingExtDepth() const { return IncomingExtDepth; }
+  void setIncomingExtDepth(int Depth) { IncomingExtDepth = Depth; }
+};
+} // end namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
index f8e3fcdac954b..ab01eab2bd54a 100644
--- a/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -54,6 +54,7 @@ BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   markSuperRegs(Reserved, BPF::W10); // [W|R]10 is read only frame pointer
   markSuperRegs(Reserved, BPF::W11); // [W|R]11 is pseudo stack pointer
+  markSuperRegs(Reserved, BPF::W12); // [W|R]12 is extra argument area pointer
   return Reserved;
 }
 
diff --git a/llvm/lib/Target/BPF/BPFRegisterInfo.td b/llvm/lib/Target/BPF/BPFRegisterInfo.td
index abeef5dc8aad2..4359087b0f986 100644
--- a/llvm/lib/Target/BPF/BPFRegisterInfo.td
+++ b/llvm/lib/Target/BPF/BPFRegisterInfo.td
@@ -28,7 +28,7 @@ class Ri<bits<16> Enc, string n, list<Register> subregs>
   let SubRegIndices = [sub_32];
 }
 
-foreach I = 0-11 in {
+foreach I = 0-12 in {
   // 32-bit Integer (alias to low part of 64-bit register).
   def W#I  : Wi<I,  "w"#I>,  DwarfRegNum<[I]>;
   // 64-bit Integer registers
@@ -39,6 +39,7 @@ foreach I = 0-11 in {
 def GPR32 : RegisterClass<"BPF", [i32], 64, (add
   (sequence "W%u", 1, 9),
   W0, // Return value
+  W12, // Extra Argument Ptr
   W11, // Stack Ptr
   W10  // Frame Ptr
 )>;
@@ -46,6 +47,7 @@ def GPR32 : RegisterClass<"BPF", [i32], 64, (add
 def GPR : RegisterClass<"BPF", [i64], 64, (add
   (sequence "R%u", 1, 9),
   R0, // Return value
+  R12, // Extra Argument Ptr
   R11, // Stack Ptr
   R10  // Frame Ptr
 )>;
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index ad3df2c879fe7..f92ed0dae22ca 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -12,6 +12,7 @@
 
 #include "BPFTargetMachine.h"
 #include "BPF.h"
+#include "BPFMachineFunctionInfo.h"
 #include "BPFTargetLoweringObjectFile.h"
 #include "BPFTargetTransformInfo.h"
 #include "MCTargetDesc/BPFMCAsmInfo.h"
@@ -209,3 +210,10 @@ bool BPFPassConfig::addGlobalInstructionSelect() {
   addPass(new InstructionSelect(getOptLevel()));
   return false;
 }
+
+MachineFunctionInfo *BPFTargetMachine::createMachineFunctionInfo(
+    BumpPtrAllocator &Allocator, const Function &F,
+    const TargetSubtargetInfo *STI) const {
+  using MFI = BPFMachineFunctionInfo; // BPF per-function codegen state.
+  return MFI::create<MFI>(Allocator, F, static_cast<const BPFSubtarget *>(STI));
+}
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.h b/llvm/lib/Target/BPF/BPFTargetMachine.h
index 15230bd06dadb..9f6c53f20f18f 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.h
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.h
@@ -43,6 +43,10 @@ class BPFTargetMachine : public CodeGenTargetMachineImpl {
   }
 
   void registerPassBuilderCallbacks(PassBuilder &PB) override;
+
+  MachineFunctionInfo *
+  createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
+                            const TargetSubtargetInfo *STI) const override;
 };
 }
 
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 813dddad7d75f..a87d4cdea31a2 100644
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -97,13 +97,13 @@ LLVMInitializeBPFDisassembler() {
 }
 
 static const unsigned GPRDecoderTable[] = {
-    BPF::R0,  BPF::R1,  BPF::R2,  BPF::R3,  BPF::R4,  BPF::R5,
-    BPF::R6,  BPF::R7,  BPF::R8,  BPF::R9,  BPF::R10, BPF::R11};
+    BPF::R0, BPF::R1, BPF::R2, BPF::R3,  BPF::R4,  BPF::R5, BPF::R6,
+    BPF::R7, BPF::R8, BPF::R9, BPF::R10, BPF::R11, BPF::R12};
 
 static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t /*Address*/,
                                            const MCDisassembler * /*Decoder*/) {
-  if (RegNo > 11)
+  if (RegNo > 12)
     return MCDisassembler::Fail;
 
   unsigned Reg = GPRDecoderTable[RegNo];
@@ -112,13 +112,13 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
 }
 
 static const unsigned GPR32DecoderTable[] = {
-    BPF::W0,  BPF::W1,  BPF::W2,  BPF::W3,  BPF::W4,  BPF::W5,
-    BPF::W6,  BPF::W7,  BPF::W8,  BPF::W9,  BPF::W10, BPF::W11};
+    BPF::W0, BPF::W1, BPF::W2, BPF::W3,  BPF::W4,  BPF::W5, BPF::W6,
+    BPF::W7, BPF::W8, BPF::W9, BPF::W10, BPF::W11, BPF::W12};
 
 static DecodeStatus
 DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/,
                          const MCDisassembler * /*Decoder*/) {
-  if (RegNo > 11)
+  if (RegNo > 12)
     return MCDisassembler::Fail;
 
   unsigned Reg = GPR32DecoderTable[RegNo];
@@ -130,7 +130,7 @@ static DecodeStatus decodeMemoryOpValue(MCInst &Inst, unsigned Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   unsigned Register = (Insn >> 16) & 0xf;
-  if (Register > 11)
+  if (Register > 12)
     return MCDisassembler::Fail;
 
   Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
diff --git a/llvm/test/CodeGen/BPF/many_args1.ll b/llvm/test/CodeGen/BPF/many_args1.ll
index 0e2ff1af06e55..e3a55deeecae6 100644
--- a/llvm/test/CodeGen/BPF/many_args1.ll
+++ b/llvm/test/CodeGen/BPF/many_args1.ll
@@ -1,6 +1,4 @@
-; RUN: not llc -mtriple=bpf -mcpu=v1 < %s 2> %t1
-; RUN: FileCheck %s < %t1
-; CHECK: error: <unknown>:0:0: in function foo i32 (i32, i32, i32): {{t10|0x[0-f]+}}: i64 = GlobalAddress<ptr @bar> 0 too many arguments
+; RUN: llc -mtriple=bpf -mcpu=v1 < %s | FileCheck %s
 
 ; Function Attrs: nounwind uwtable
 define i32 @foo(i32 %a, i32 %b, i32 %c) #0 {
@@ -9,4 +7,6 @@ entry:
   ret i32 %call
 }
 
+; CHECK: call bar
+
 declare i32 @bar(i32, i32, i32, i32, i32, i32) #1
diff --git a/llvm/test/CodeGen/BPF/many_args2.ll b/llvm/test/CodeGen/BPF/many_args2.ll
index d1feefc0e4047..f35fe5c9bf951 100644
--- a/llvm/test/CodeGen/BPF/many_args2.ll
+++ b/llvm/test/CodeGen/BPF/many_args2.ll
@@ -1,6 +1,4 @@
-; RUN: not llc -mtriple=bpf < %s 2> %t1
-; RUN: FileCheck %s < %t1
-; CHECK: error: <unknown>:0:0: in function bar i32 (i32, i32, i32, i32, i32, i32): stack arguments are not supported
+; RUN: llc -mtriple=bpf < %s | FileCheck %s
 
 ; Function Attrs: nounwind readnone uwtable
 define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f) #0 {
@@ -8,6 +6,8 @@ entry:
   ret i32 1
 }
 
+; CHECK-LABEL: bar:
+
 ; Function Attrs: nounwind readnone uwtable
 define i32 @foo(i32 %a, i32 %b, i32 %c) #0 {
 entry:
diff --git a/llvm/test/CodeGen/BPF/many_args3.ll b/llvm/test/CodeGen/BPF/many_args3.ll
new file mode 100644
index 0000000000000..b32ab44a86fa8
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/many_args3.ll
@@ -0,0 +1,196 @@
+; RUN: llc < %s -mtriple=bpf -mcpu=v3 | FileCheck %s
+
+; Source code:
+;   struct t { long a; long b; };
+;   long foo1(int a1, int a2, int a3, int a4, int a5, short a6, long a7) {
+;     return a1 + a2 + a3 + a4 + a5 + a6 + a7;
+;   }
+;
+;   long foo2(int a1, int a2, int a3, int a4, int a5, struct t a6, int a7) {
+;     return a1 + a2 + a3 + a4 + a5 + a6.a + a6.b + a7;
+;   }
+;
+;   long foo3(struct t a1, int a2, int a3, int a4, int a5, struct t a6) {
+;     return a1.a + a1.b + a2 + a3 + a4 + a5 + a6.a + a6.b;
+;   }
+;
+;   long foo4(int a1, int a2, int a3, int a4, int a5, struct t a6, struct t a7) {
+;     return a1 + a2 + a3 + a4 + a5 + a6.a + a6.b + a7.a + a7.b;
+;   }
+;
+;   long bar5(int a1, int a2, int a3, int a4, int a5, short a6, long a7);
+;   long foo5(int a1, int a2, int a3) {
+;     return bar5(a1, a2, a3, a2, a3, a1, a2);
+;   }
+;
+;   long bar6(int a1, int a2, int a3, int a4, int a5, struct t a6, int a7);
+;   long foo6(int a1, int a2, int a3) {
+;     struct t tmp = {a1, a2};
+;     return bar6(a1, a2, a3, a2, a3, tmp, a2);
+;   }
+;
+;   long bar7(struct t a1, int a2, int a3, int a4, int a5, struct t a6);
+;   long foo7(int a1, int a2, int a3) {
+;     struct t tmp1 = {a1, a2};
+;     struct t tmp2 = {a2, a3};
+;     return bar7(tmp1, a3, a2, a1, a2, tmp2);
+;   }
+;
+;   long bar8(int a1, int a2, int a3, int a4, int a5, struct t a6, struct t a7);
+;   long foo8(int a1, int a2, int a3) {
+;     struct t tmp1 = { a3, a2 };
+;     struct t  tmp2 = { a2, a3 };
+;     return bar8(a1, a2, a3, a2, a3, tmp1, tmp2);
+;   }
+
+define dso_local i64 @foo1(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i16 noundef signext %5, i64 noundef %6) local_unnamed_addr {
+  %8 = add nsw i32 %1, %0
+  %9 = add nsw i32 %8, %2
+  %10 = add nsw i32 %9, %3
+  %11 = add nsw i32 %10, %4
+  %12 = sext i16 %5 to i32
+  %13 = add nsw i32 %11, %12
+  %14 = sext i32 %13 to i64
+  %15 = add nsw i64 %6, %14
+  ret i64 %15
+}
+
+; CHECK-LABEL:   foo1:
+; CHECK:         r[[#]] = *(u64 *)(r12 - 8)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 16)
+
+define dso_local i64 @foo2(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, [2 x i64] %5, i32 noundef %6) local_unnamed_addr {
+  %8 = extractvalue [2 x i64] %5, 0
+  %9 = extractvalue [2 x i64] %5, 1
+  %10 = add nsw i32 %1, %0
+  %11 = add nsw i32 %10, %2
+  %12 = add nsw i32 %11, %3
+  %13 = add nsw i32 %12, %4
+  %14 = sext i32 %13 to i64
+  %15 = add nsw i64 %8, %14
+  %16 = add nsw i64 %15, %9
+  %17 = sext i32 %6 to i64
+  %18 = add nsw i64 %16, %17
+  ret i64 %18
+}
+
+; CHECK-LABEL:   foo2:
+; CHECK:         r[[#]] = *(u64 *)(r12 - 8)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 16)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 24)
+
+define dso_local i64 @foo3([2 x i64] %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, [2 x i64] %5) local_unnamed_addr {
+  %7 = extractvalue [2 x i64] %0, 0
+  %8 = extractvalue [2 x i64] %0, 1
+  %9 = extractvalue [2 x i64] %5, 0
+  %10 = extractvalue [2 x i64] %5, 1
+  %11 = add nsw i64 %7, %8
+  %12 = sext i32 %1 to i64
+  %13 = add nsw i64 %11, %12
+  %14 = sext i32 %2 to i64
+  %15 = add nsw i64 %13, %14
+  %16 = sext i32 %3 to i64
+  %17 = add nsw i64 %15, %16
+  %18 = sext i32 %4 to i64
+  %19 = add nsw i64 %17, %18
+  %20 = add nsw i64 %19, %9
+  %21 = add nsw i64 %20, %10
+  ret i64 %21
+}
+
+; CHECK-LABEL:   foo3:
+; CHECK:         r[[#]] = *(u64 *)(r12 - 8)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 16)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 24)
+
+define dso_local i64 @foo4(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, [2 x i64] %5, [2 x i64] %6) local_unnamed_addr {
+  %8 = extractvalue [2 x i64] %5, 0
+  %9 = extractvalue [2 x i64] %5, 1
+  %10 = extractvalue [2 x i64] %6, 0
+  %11 = extractvalue [2 x i64] %6, 1
+  %12 = add nsw i32 %1, %0
+  %13 = add nsw i32 %12, %2
+  %14 = add nsw i32 %13, %3
+  %15 = add nsw i32 %14, %4
+  %16 = sext i32 %15 to i64
+  %17 = add nsw i64 %8, %16
+  %18 = add nsw i64 %17, %9
+  %19 = add nsw i64 %18, %10
+  %20 = add nsw i64 %19, %11
+  ret i64 %20
+}
+
+; CHECK-LABEL:   foo4:
+; CHECK:         r[[#]] = *(u64 *)(r12 - 8)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 16)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 24)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 32)
+
+define dso_local i64 @foo5(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr {
+  %4 = trunc i32 %0 to i16
+  %5 = sext i32 %1 to i64
+  %6 = tail call i64 @bar5(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, i32 noundef %2, i16 noundef signext %4, i64 noundef %5)
+  ret i64 %6
+}
+
+; CHECK-LABEL:   foo5:
+; CHECK:         *(u64 *)(r12 - 8) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 16) = r[[#]]
+
+declare dso_local i64 @bar5(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i16 noundef signext, i64 noundef) local_unnamed_addr
+
+define dso_local i64 @foo6(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr {
+  %4 = sext i32 %0 to i64
+  %5 = sext i32 %1 to i64
+  %6 = insertvalue [2 x i64] poison, i64 %4, 0
+  %7 = insertvalue [2 x i64] %6, i64 %5, 1
+  %8 = tail call i64 @bar6(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, i32 noundef %2, [2 x i64] %7, i32 noundef %1)
+  ret i64 %8
+}
+
+; CHECK-LABEL:   foo6:
+; CHECK:         *(u64 *)(r12 - 8) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 16) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 24) = r[[#]]
+
+declare dso_local i64 @bar6(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64], i32 noundef) local_unnamed_addr
+
+define dso_local i64 @foo7(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr {
+  %4 = sext i32 %0 to i64
+  %5 = sext i32 %1 to i64
+  %6 = sext i32 %2 to i64
+  %7 = insertvalue [2 x i64] poison, i64 %4, 0
+  %8 = insertvalue [2 x i64] %7, i64 %5, 1
+  %9 = insertvalue [2 x i64] poison, i64 %5, 0
+  %10 = insertvalue [2 x i64] %9, i64 %6, 1
+  %11 = tail call i64 @bar7([2 x i64] %8, i32 noundef %2, i32 noundef %1, i32 noundef %0, i32 noundef %1, [2 x i64] %10)
+  ret i64 %11
+}
+
+; CHECK-LABEL:   foo7:
+; CHECK:         *(u64 *)(r12 - 8) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 16) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 24) = r[[#]]
+
+declare dso_local i64 @bar7([2 x i64], i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64]) local_unnamed_addr
+
+define dso_local i64 @foo8(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr {
+  %4 = sext i32 %2 to i64
+  %5 = sext i32 %1 to i64
+  %6 = insertvalue [2 x i64] poison, i64 %4, 0
+  %7 = insertvalue [2 x i64] %6, i64 %5, 1
+  %8 = insertvalue [2 x i64] poison, i64 %5, 0
+  %9 = insertvalue [2 x i64] %8, i64 %4, 1
+  %10 = tail call i64 @bar8(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, i32 noundef %2, [2 x i64] %7, [2 x i64] %9)
+  ret i64 %10
+}
+
+; CHECK-LABEL:   foo8:
+; CHECK:         *(u64 *)(r12 - 8) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 16) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 24) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 32) = r[[#]]
+
+; CHECK-NOT:     *(u64 *)(r12 - 40) = r[[#]]
+
+declare dso_local i64 @bar8(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64], [2 x i64]) local_unnamed_addr
diff --git a/llvm/test/CodeGen/BPF/many_args4.ll b/llvm/test/CodeGen/BPF/many_args4.ll
new file mode 100644
index 0000000000000..3d047e544c0db
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/many_args4.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=bpf -mcpu=v3 | FileCheck %s
+
+; Source code:
+;   __attribute__((noinline)) long foo1(int a, int b, int c, int d, int e, int f) {
+;     return a + b + c + d + e + f;
+;   }
+;
+;   __attribute__((noinline)) long foo2(int a, int b, int c, int d, int e, int f, int g) {
+;     return a + b + c + d + e + f + g;
+;   }
+;
+;   long bar(int a, int b, int c, int d, int e, int f, int g) {
+;     long r1 = foo1(a, b, c, d, e, f + g);
+;     long r2 = foo2(a, b, c, d, e, f, g);
+;     return r1 + r2;
+;   }
+
+
+define dso_local range(i64 -2147483648, 2147483648) i64 @foo1(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5) local_unnamed_addr { ; six i32 parameters; returns their sum widened to i64
+  %7 = add nsw i32 %1, %0 ; running 32-bit sum, one parameter folded in per add
+  %8 = add nsw i32 %7, %2
+  %9 = add nsw i32 %8, %3
+  %10 = add nsw i32 %9, %4
+  %11 = add nsw i32 %10, %5 ; sixth parameter joins the sum last
+  %12 = sext i32 %11 to i64 ; sign-extend the i32 total to the i64 return type
+  ret i64 %12
+}
+
+; CHECK-LABEL:   foo1:
+; CHECK:         r[[#]] = *(u64 *)(r12 - 8)
+
+define dso_local range(i64 -2147483648, 2147483648) i64 @foo2(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6) local_unnamed_addr { ; seven i32 parameters; returns their sum widened to i64
+  %8 = add nsw i32 %1, %0 ; running 32-bit sum, one parameter folded in per add
+  %9 = add nsw i32 %8, %2
+  %10 = add nsw i32 %9, %3
+  %11 = add nsw i32 %10, %4
+  %12 = add nsw i32 %11, %5 ; sixth parameter
+  %13 = add nsw i32 %12, %6 ; seventh parameter
+  %14 = sext i32 %13 to i64 ; sign-extend the i32 total to the i64 return type
+  ret i64 %14
+}
+
+; CHECK-LABEL:   foo2:
+; CHECK:         r[[#]] = *(u64 *)(r12 - 8)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 16)
+
+define dso_local range(i64 -4294967296, 4294967295) i64 @bar(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6) local_unnamed_addr { ; seven i32 parameters; calls @foo1 then @foo2 and returns the sum of their results
+  %8 = add nsw i32 %6, %5 ; f + g from the C source, used as the sixth argument of @foo1
+  %9 = tail call i64 @foo1(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %8) ; six-argument call
+  %10 = tail call i64 @foo2(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6) ; seven-argument call forwarding all parameters unchanged
+  %11 = add nsw i64 %10, %9 ; r1 + r2 in 64 bits
+  ret i64 %11
+}
+
+; CHECK-LABEL:   bar:
+; CHECK:         r[[#]] = *(u64 *)(r12 - 8)
+; CHECK:         r[[#]] = *(u64 *)(r12 - 16)
+; CHECK:         *(u64 *)(r12 - 24) = r[[#]]
+; CHECK:         call foo1
+; CHECK:         *(u64 *)(r12 - 24) = r[[#]]
+; CHECK:         *(u64 *)(r12 - 32) = r[[#]]
+; CHECK:         call foo2
diff --git a/llvm/test/CodeGen/BPF/many_args5.ll b/llvm/test/CodeGen/BPF/many_args5.ll
new file mode 100644
index 0000000000000..af4bf94e0f742
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/many_args5.ll
@@ -0,0 +1,22 @@
+; RUN: not llc -mtriple=bpf -mcpu=v3 < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: error: <unknown>:0:0: in function foo i64 (i32, i32, i32, i32, [2 x i64]): aggregate argument is split between registers and stack
+
+; Source code:
+;   struct t { long a; long b; };
+;
+;   long foo(int a1, int a2, int a3, int a4, struct t a5) {
+;     return a1 + a2 + a3 + a4 + a5.a + a5.b;
+;   }
+
+define dso_local i64 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, [2 x i64] %4) local_unnamed_addr { ; four i32 parameters followed by a two-element i64 aggregate as the fifth
+  %6 = extractvalue [2 x i64] %4, 0 ; a5.a in the C source
+  %7 = extractvalue [2 x i64] %4, 1 ; a5.b in the C source
+  %8 = add nsw i32 %1, %0 ; 32-bit sum of the four scalar parameters
+  %9 = add nsw i32 %8, %2
+  %10 = add nsw i32 %9, %3
+  %11 = sext i32 %10 to i64 ; widen the scalar sum before adding the i64 fields
+  %12 = add nsw i64 %6, %11
+  %13 = add nsw i64 %12, %7
+  ret i64 %13
+}
diff --git a/llvm/test/CodeGen/BPF/many_args6.ll b/llvm/test/CodeGen/BPF/many_args6.ll
new file mode 100644
index 0000000000000..6e1c11b30ed2d
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/many_args6.ll
@@ -0,0 +1,23 @@
+; RUN: not llc -mtriple=bpf -mcpu=v3 < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: error: <unknown>:0:0: in function foo i64 (i32, i32, i32): {{(0x[0-9a-fA-F]+|t[0-9]+)}}: i64 = GlobalAddress<ptr @bar> 0 aggregate argument is split between registers and stack
+
+; Source code:
+;   struct t { long a; long b; };
+;
+;   long bar(int a1, int a2, int a3, int a4, struct t a5);
+;   long foo(int a1, int a2, int a3) {
+;     struct t tmp = {a1, a2};
+;     return bar(a1, a2, a3, a2, tmp);
+;   }
+
+define dso_local i64 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr { ; caller for @bar: four i32 arguments followed by a two-element i64 aggregate
+  %4 = sext i32 %0 to i64 ; widen the two values that become the aggregate's fields
+  %5 = sext i32 %1 to i64
+  %6 = insertvalue [2 x i64] poison, i64 %4, 0 ; aggregate = { sext(%0), sext(%1) }, i.e. tmp in the C source
+  %7 = insertvalue [2 x i64] %6, i64 %5, 1
+  %8 = tail call i64 @bar(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, [2 x i64] %7) ; the aggregate is the fifth argument of the call
+  ret i64 %8 ; forward the callee's result unchanged
+}
+
+declare dso_local i64 @bar(i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64]) local_unnamed_addr
diff --git a/llvm/test/CodeGen/BPF/many_args7.ll b/llvm/test/CodeGen/BPF/many_args7.ll
new file mode 100644
index 0000000000000..dcb543886620b
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/many_args7.ll
@@ -0,0 +1,36 @@
+; RUN: not llc -mtriple=bpf -mcpu=v3 < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: error: <unknown>:0:0: in function foo i64 (i32, i32, i32): {{(0x[0-9a-fA-F]+|t[0-9]+)}}: i64 = GlobalAddress<ptr @bar> 0 pass by value not supported
+
+; Source code:
+;   struct t { long a; long b; long c;};
+;
+;   long bar(int a1, int a2, int a3, int a4, int a5, struct t a6);
+;   long foo(int a1, int a2, int a3) {
+;     struct t tmp = {a1, a2, a3};
+;     return bar(a1, a2, a3, a2, a1, tmp);
+;   }
+
+%struct.t = type { i64, i64, i64 }
+
+define dso_local i64 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr { ; caller for @bar: five i32 arguments plus a byval pointer to a three-field %struct.t
+  %4 = alloca %struct.t, align 8 ; local storage for tmp from the C source
+  call void @llvm.lifetime.start.p0(ptr nonnull %4)
+  %5 = sext i32 %0 to i64
+  store i64 %5, ptr %4, align 8 ; first i64 field, at byte offset 0
+  %6 = getelementptr inbounds nuw i8, ptr %4, i64 8 ; address of the second field (byte offset 8)
+  %7 = sext i32 %1 to i64
+  store i64 %7, ptr %6, align 8
+  %8 = getelementptr inbounds nuw i8, ptr %4, i64 16 ; address of the third field (byte offset 16)
+  %9 = sext i32 %2 to i64
+  store i64 %9, ptr %8, align 8
+  %10 = tail call i64 @bar(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, i32 noundef %0, ptr noundef nonnull byval(%struct.t) align 8 %4) ; the struct is the sixth argument, passed with the byval attribute
+  call void @llvm.lifetime.end.p0(ptr nonnull %4)
+  ret i64 %10 ; forward the callee's result unchanged
+}
+
+declare void @llvm.lifetime.start.p0(ptr captures(none))
+
+declare dso_local i64 @bar(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, ptr noundef byval(%struct.t) align 8) local_unnamed_addr
+
+declare void @llvm.lifetime.end.p0(ptr captures(none))



More information about the cfe-commits mailing list