[clang] [llvm] [AIX] Add -msave-reg-params to save arguments to stack (PR #97524)

Qiu Chaofan via cfe-commits cfe-commits at lists.llvm.org
Mon Jul 8 00:53:52 PDT 2024


https://github.com/ecnelises updated https://github.com/llvm/llvm-project/pull/97524

>From 654cf7753023302c367340872e889856f8738169 Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan at cn.ibm.com>
Date: Wed, 3 Jul 2024 14:17:01 +0800
Subject: [PATCH 1/2] [AIX] Add -msave-reg-params to save arguments to stack

In the PowerPC ABI, the first few arguments are passed in registers,
but their slots in the parameter save area are still reserved; arguments
passed in memory go after the reserved locations.

For debugging purposes, we may want to save a copy of the pass-by-reg
arguments into their correct places on the stack. The new option achieves
this by adding a new function-level attribute and making the argument
lowering aware of it.
---
 clang/include/clang/Basic/CodeGenOptions.def  |   3 +
 clang/include/clang/Driver/Options.td         |   4 +
 clang/lib/CodeGen/CGCall.cpp                  |   3 +
 clang/lib/Driver/ToolChains/AIX.cpp           |   3 +
 clang/test/CodeGen/PowerPC/save-reg-params.c  |  12 +
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     |  20 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  29 +
 .../Target/PowerPC/PPCMachineFunctionInfo.h   |   6 +
 llvm/test/CodeGen/PowerPC/save-reg-params.ll  | 816 ++++++++++++++++++
 9 files changed, 888 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/CodeGen/PowerPC/save-reg-params.c
 create mode 100644 llvm/test/CodeGen/PowerPC/save-reg-params.ll

diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index e3f6da4a84f694..fcf15aa9c400ac 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -425,6 +425,9 @@ CODEGENOPT(ForceAAPCSBitfieldLoad, 1, 0)
 /// Assume that by-value parameters do not alias any other values.
 CODEGENOPT(PassByValueIsNoAlias, 1, 0)
 
+/// Whether to store register parameters to stack.
+CODEGENOPT(SaveRegParams, 1, 0)
+
 /// Whether to not follow the AAPCS that enforces volatile bit-field access width to be
 /// according to the field declaring type width.
 CODEGENOPT(AAPCSBitfieldWidth, 1, 1)
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1c2b8cfeef6ce6..4135f0db604509 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5040,6 +5040,10 @@ def mspe : Flag<["-"], "mspe">, Group<m_ppc_Features_Group>;
 def mno_spe : Flag<["-"], "mno-spe">, Group<m_ppc_Features_Group>;
 def mefpu2 : Flag<["-"], "mefpu2">, Group<m_ppc_Features_Group>;
 } // let Flags = [TargetSpecific]
+def msave_reg_params : Flag<["-"], "msave-reg-params">, Group<m_Group>,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"Save arguments passed by registers to stack">,
+  MarshallingInfoFlag<CodeGenOpts<"SaveRegParams">>;
 def mabi_EQ_quadword_atomics : Flag<["-"], "mabi=quadword-atomics">,
   Group<m_Group>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Enable quadword atomics ABI on AIX (AIX PPC64 only). Uses lqarx/stqcx. instructions.">,
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 2b301130ef7b70..8269755cdbf89d 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1931,6 +1931,9 @@ static void getTrivialDefaultFunctionAttributes(
     if (CodeGenOpts.NullPointerIsValid)
       FuncAttrs.addAttribute(llvm::Attribute::NullPointerIsValid);
 
+    if (CodeGenOpts.SaveRegParams)
+      FuncAttrs.addAttribute("save-reg-params");
+
     if (LangOpts.getDefaultExceptionMode() == LangOptions::FPE_Ignore)
       FuncAttrs.addAttribute("no-trapping-math", "true");
 
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index b04502a57a9f7a..c2de7328c25c5d 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -548,6 +548,9 @@ void AIX::addClangTargetOptions(
                   options::OPT_mtocdata))
     addTocDataOptions(Args, CC1Args, getDriver());
 
+  if (Args.hasArg(options::OPT_msave_reg_params))
+    CC1Args.push_back("-msave-reg-params");
+
   if (Args.hasFlag(options::OPT_fxl_pragma_pack,
                    options::OPT_fno_xl_pragma_pack, true))
     CC1Args.push_back("-fxl-pragma-pack");
diff --git a/clang/test/CodeGen/PowerPC/save-reg-params.c b/clang/test/CodeGen/PowerPC/save-reg-params.c
new file mode 100644
index 00000000000000..6599310afa41a3
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/save-reg-params.c
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s -msave-reg-params | FileCheck -check-prefix=SAVE %s
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -o - %s -msave-reg-params | FileCheck -check-prefix=SAVE %s
+// RUN: %clang_cc1 -triple powerpc-ibm-aix -emit-llvm -o - %s -msave-reg-params | FileCheck -check-prefix=SAVE %s
+// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=NOSAVE %s
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -o - %s | FileCheck -check-prefix=NOSAVE %s
+// RUN: %clang_cc1 -triple powerpc-ibm-aix -emit-llvm -o - %s | FileCheck -check-prefix=NOSAVE %s
+
+void bar(int);
+void foo(int x) { bar(x); }
+
+// SAVE: attributes #{{[0-9]+}} = { {{.+}} "save-reg-params" {{.+}} }
+// NOSAVE-NOT: "save-reg-params"···
\ No newline at end of file
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index d74143b487880a..085a67e9194215 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2496,15 +2496,19 @@ void PPCAIXAsmPrinter::emitTracebackTable() {
 
   uint32_t GPRSaved = 0;
 
-  // X13 is reserved under 64-bit environment.
-  unsigned GPRBegin = Subtarget->isPPC64() ? PPC::X14 : PPC::R13;
-  unsigned GPREnd = Subtarget->isPPC64() ? PPC::X31 : PPC::R31;
-
-  for (unsigned Reg = GPRBegin; Reg <= GPREnd; ++Reg) {
-    if (MRI.isPhysRegModified(Reg)) {
-      GPRSaved = GPREnd - Reg + 1;
-      break;
+  if (FI->getForceGPRSaveCount() < 0) {
+    // X13 is reserved under 64-bit environment.
+    unsigned GPRBegin = Subtarget->isPPC64() ? PPC::X14 : PPC::R13;
+    unsigned GPREnd = Subtarget->isPPC64() ? PPC::X31 : PPC::R31;
+
+    for (unsigned Reg = GPRBegin; Reg <= GPREnd; ++Reg) {
+      if (MRI.isPhysRegModified(Reg)) {
+        GPRSaved = GPREnd - Reg + 1;
+        break;
+      }
     }
+  } else {
+    GPRSaved = FI->getForceGPRSaveCount();
   }
 
   SecondHalfOfMandatoryField |= (GPRSaved << TracebackTable::GPRSavedShift) &
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 4d4008ac0ba706..9b5d448d0ce448 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7224,6 +7224,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   // Reserve space for the linkage area on the stack.
   const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
   CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
+  uint64_t SaveStackPos = CCInfo.getStackSize();
+  bool SaveParams = MF.getFunction().hasFnAttribute("save-reg-params");
   CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
 
   SmallVector<SDValue, 8> MemOps;
@@ -7242,6 +7244,28 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
     if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())
       continue;
 
+    if (SaveParams && VA.isRegLoc() && !Flags.isByVal()) {
+      const TargetRegisterClass *RegClass = getRegClassForSVT(
+          LocVT.SimpleTy, IsPPC64, Subtarget.hasP8Vector(), Subtarget.hasVSX());
+      // On PPC64, we need to use std instead of stw for GPR.
+      MVT SaveVT = RegClass == &PPC::G8RCRegClass ? MVT::i64 : LocVT;
+      const Register VReg = MF.addLiveIn(VA.getLocReg(), RegClass);
+      SDValue Parm = DAG.getRegister(VReg, SaveVT);
+      int FI = MFI.CreateFixedObject(SaveVT.getStoreSize(), SaveStackPos, true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      SDValue StoreReg = DAG.getStore(Chain, dl, Parm, FIN,
+                                      MachinePointerInfo(), Align(PtrByteSize));
+      SaveStackPos = alignTo(SaveStackPos + SaveVT.getStoreSize(), PtrByteSize);
+      MemOps.push_back(StoreReg);
+      Chain = StoreReg;
+    }
+
+    if (SaveParams && (VA.isMemLoc() || Flags.isByVal())) {
+      unsigned StoreSize =
+          Flags.isByVal() ? Flags.getByValSize() : LocVT.getStoreSize();
+      SaveStackPos = alignTo(SaveStackPos + StoreSize, PtrByteSize);
+    }
+
     auto HandleMemLoc = [&]() {
       const unsigned LocSize = LocVT.getStoreSize();
       const unsigned ValSize = ValVT.getStoreSize();
@@ -7454,6 +7478,11 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   FuncInfo->setMinReservedArea(CallerReservedArea);
 
   if (isVarArg) {
+    // Maximum number of saved GPR in traceback table is 8, for varargs,
+    // assuming eight GPRs matches XL behavior.
+    if (SaveParams)
+      FuncInfo->setForceGPRSaveCount(8);
+
     FuncInfo->setVarArgsFrameIndex(
         MFI.CreateFixedObject(PtrByteSize, CCInfo.getStackSize(), true));
     SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index b7d14da05ee248..a9e7d63237c7bc 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -150,6 +150,9 @@ class PPCFunctionInfo : public MachineFunctionInfo {
   /// to use SExt/ZExt flags in later optimization.
   std::vector<std::pair<Register, ISD::ArgFlagsTy>> LiveInAttrs;
 
+  /// Set a fixed number of saved GPRs, negative if it's non-fixed.
+  int ForceGPRSaveCount = -1;
+
   /// Flags for aix-shared-lib-tls-model-opt, will be lazily initialized for
   /// each function.
   bool AIXFuncUseTLSIEForLD = false;
@@ -163,6 +166,9 @@ class PPCFunctionInfo : public MachineFunctionInfo {
         const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
+  int getForceGPRSaveCount() const { return ForceGPRSaveCount; }
+  void setForceGPRSaveCount(int Num) { ForceGPRSaveCount = Num; }
+
   int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
   void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
 
diff --git a/llvm/test/CodeGen/PowerPC/save-reg-params.ll b/llvm/test/CodeGen/PowerPC/save-reg-params.ll
new file mode 100644
index 00000000000000..ad29a202db115f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/save-reg-params.ll
@@ -0,0 +1,816 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-ibm-aix -mcpu=pwr7 < %s | FileCheck %s -check-prefix=32BIT
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix -mcpu=pwr7 < %s | FileCheck %s -check-prefix=64BIT
+
+define void @i64_join(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i, i64 %j) #0 {
+; 32BIT-LABEL: i64_join:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: i64_join:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = add nsw i64 %b, %a
+  %add1 = add nsw i64 %add, %c
+  %add2 = add nsw i64 %add1, %d
+  %add3 = add nsw i64 %add2, %e
+  %add4 = add nsw i64 %add3, %f
+  %add5 = add nsw i64 %add4, %g
+  %add6 = add nsw i64 %add5, %h
+  %add7 = add nsw i64 %add6, %i
+  %add8 = add nsw i64 %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @i64_join_missing(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i, i64 %j) #0 {
+; 32BIT-LABEL: i64_join_missing:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: i64_join_missing:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add1 = mul nsw i64 %a, 3
+  %add2 = add nsw i64 %add1, %d
+  %add3 = add nsw i64 %add2, %e
+  %add4 = add nsw i64 %add3, %f
+  %add5 = add nsw i64 %add4, %g
+  %add6 = add nsw i64 %add5, %h
+  %add7 = add nsw i64 %add6, %i
+  %add8 = add nsw i64 %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @i32_join(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h, i32 signext %i, i32 signext %j) #0 {
+; 32BIT-LABEL: i32_join:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: i32_join:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = add nsw i32 %b, %a
+  %add1 = add nsw i32 %add, %c
+  %add2 = add nsw i32 %add1, %d
+  %add3 = add nsw i32 %add2, %e
+  %add4 = add nsw i32 %add3, %f
+  %add5 = add nsw i32 %add4, %g
+  %add6 = add nsw i32 %add5, %h
+  %add7 = add nsw i32 %add6, %i
+  %add8 = add nsw i32 %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @i32_join_missing(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h, i32 signext %i, i32 signext %j) #0 {
+; 32BIT-LABEL: i32_join_missing:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: i32_join_missing:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add1 = mul nsw i32 %a, 3
+  %add2 = add nsw i32 %add1, %d
+  %add3 = add nsw i32 %add2, %e
+  %add4 = add nsw i32 %add3, %f
+  %add5 = add nsw i32 %add4, %g
+  %add6 = add nsw i32 %add5, %h
+  %add7 = add nsw i32 %add6, %i
+  %add8 = add nsw i32 %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @f32_join(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j) #0 {
+; 32BIT-LABEL: f32_join:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stfs 10, 124(1)
+; 32BIT-NEXT:    stfs 9, 120(1)
+; 32BIT-NEXT:    stfs 8, 116(1)
+; 32BIT-NEXT:    stfs 7, 112(1)
+; 32BIT-NEXT:    stfs 6, 108(1)
+; 32BIT-NEXT:    stfs 5, 104(1)
+; 32BIT-NEXT:    stfs 4, 100(1)
+; 32BIT-NEXT:    stfs 3, 96(1)
+; 32BIT-NEXT:    stfs 2, 92(1)
+; 32BIT-NEXT:    stfs 1, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: f32_join:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    stfs 10, 232(1)
+; 64BIT-NEXT:    stfs 9, 224(1)
+; 64BIT-NEXT:    stfs 8, 216(1)
+; 64BIT-NEXT:    stfs 7, 208(1)
+; 64BIT-NEXT:    stfs 6, 200(1)
+; 64BIT-NEXT:    stfs 5, 192(1)
+; 64BIT-NEXT:    stfs 4, 184(1)
+; 64BIT-NEXT:    stfs 3, 176(1)
+; 64BIT-NEXT:    stfs 2, 168(1)
+; 64BIT-NEXT:    stfs 1, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd float %a, %b
+  %add1 = fadd float %add, %c
+  %add2 = fadd float %add1, %d
+  %add3 = fadd float %add2, %e
+  %add4 = fadd float %add3, %f
+  %add5 = fadd float %add4, %g
+  %add6 = fadd float %add5, %h
+  %add7 = fadd float %add6, %i
+  %add8 = fadd float %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @f32_join_missing(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j) #0 {
+; 32BIT-LABEL: f32_join_missing:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stfs 10, 124(1)
+; 32BIT-NEXT:    stfs 9, 120(1)
+; 32BIT-NEXT:    stfs 8, 116(1)
+; 32BIT-NEXT:    stfs 7, 112(1)
+; 32BIT-NEXT:    stfs 6, 108(1)
+; 32BIT-NEXT:    stfs 5, 104(1)
+; 32BIT-NEXT:    stfs 4, 100(1)
+; 32BIT-NEXT:    stfs 3, 96(1)
+; 32BIT-NEXT:    stfs 2, 92(1)
+; 32BIT-NEXT:    stfs 1, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: f32_join_missing:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    stfs 10, 232(1)
+; 64BIT-NEXT:    stfs 9, 224(1)
+; 64BIT-NEXT:    stfs 8, 216(1)
+; 64BIT-NEXT:    stfs 7, 208(1)
+; 64BIT-NEXT:    stfs 6, 200(1)
+; 64BIT-NEXT:    stfs 5, 192(1)
+; 64BIT-NEXT:    stfs 4, 184(1)
+; 64BIT-NEXT:    stfs 3, 176(1)
+; 64BIT-NEXT:    stfs 2, 168(1)
+; 64BIT-NEXT:    stfs 1, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd float %a, %a
+  %add1 = fadd float %add, %a
+  %add2 = fadd float %add1, %d
+  %add3 = fadd float %add2, %e
+  %add4 = fadd float %add3, %f
+  %add5 = fadd float %add4, %g
+  %add6 = fadd float %add5, %h
+  %add7 = fadd float %add6, %i
+  %add8 = fadd float %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @f64_join(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j) #0 {
+; 32BIT-LABEL: f64_join:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stfd 10, 160(1)
+; 32BIT-NEXT:    stfd 9, 152(1)
+; 32BIT-NEXT:    stfd 8, 144(1)
+; 32BIT-NEXT:    stfd 7, 136(1)
+; 32BIT-NEXT:    stfd 6, 128(1)
+; 32BIT-NEXT:    stfd 5, 120(1)
+; 32BIT-NEXT:    stfd 4, 112(1)
+; 32BIT-NEXT:    stfd 3, 104(1)
+; 32BIT-NEXT:    stfd 2, 96(1)
+; 32BIT-NEXT:    stfd 1, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: f64_join:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    stfd 10, 232(1)
+; 64BIT-NEXT:    stfd 9, 224(1)
+; 64BIT-NEXT:    stfd 8, 216(1)
+; 64BIT-NEXT:    stfd 7, 208(1)
+; 64BIT-NEXT:    stfd 6, 200(1)
+; 64BIT-NEXT:    stfd 5, 192(1)
+; 64BIT-NEXT:    stfd 4, 184(1)
+; 64BIT-NEXT:    stfd 3, 176(1)
+; 64BIT-NEXT:    stfd 2, 168(1)
+; 64BIT-NEXT:    stfd 1, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd double %a, %b
+  %add1 = fadd double %add, %c
+  %add2 = fadd double %add1, %d
+  %add3 = fadd double %add2, %e
+  %add4 = fadd double %add3, %f
+  %add5 = fadd double %add4, %g
+  %add6 = fadd double %add5, %h
+  %add7 = fadd double %add6, %i
+  %add8 = fadd double %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @f64_missing(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j) #0 {
+; 32BIT-LABEL: f64_missing:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stfd 10, 160(1)
+; 32BIT-NEXT:    stfd 9, 152(1)
+; 32BIT-NEXT:    stfd 8, 144(1)
+; 32BIT-NEXT:    stfd 7, 136(1)
+; 32BIT-NEXT:    stfd 6, 128(1)
+; 32BIT-NEXT:    stfd 5, 120(1)
+; 32BIT-NEXT:    stfd 4, 112(1)
+; 32BIT-NEXT:    stfd 3, 104(1)
+; 32BIT-NEXT:    stfd 2, 96(1)
+; 32BIT-NEXT:    stfd 1, 88(1)
+; 32BIT-NEXT:    bl .foo[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: f64_missing:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    stfd 10, 232(1)
+; 64BIT-NEXT:    stfd 9, 224(1)
+; 64BIT-NEXT:    stfd 8, 216(1)
+; 64BIT-NEXT:    stfd 7, 208(1)
+; 64BIT-NEXT:    stfd 6, 200(1)
+; 64BIT-NEXT:    stfd 5, 192(1)
+; 64BIT-NEXT:    stfd 4, 184(1)
+; 64BIT-NEXT:    stfd 3, 176(1)
+; 64BIT-NEXT:    stfd 2, 168(1)
+; 64BIT-NEXT:    stfd 1, 160(1)
+; 64BIT-NEXT:    bl .foo[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd double %a, %a
+  %add1 = fadd double %add, %a
+  %add2 = fadd double %add1, %d
+  %add3 = fadd double %add2, %e
+  %add4 = fadd double %add3, %f
+  %add5 = fadd double %add4, %g
+  %add6 = fadd double %add5, %h
+  %add7 = fadd double %add6, %i
+  %add8 = fadd double %add7, %j
+  tail call void @foo()
+  ret void
+}
+
+define void @mixed_1(double %a, i64 %b, i64 %c, i32 signext %d, i64 %e, float %f, float %g, double %h, i32 signext %i, double %j) #0 {
+; 32BIT-LABEL: mixed_1:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -112(1)
+; 32BIT-NEXT:    stw 0, 120(1)
+; 32BIT-NEXT:    stfd 1, 136(1)
+; 32BIT-NEXT:    xsadddp 1, 1, 5
+; 32BIT-NEXT:    stw 24, 64(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    lwz 24, 168(1)
+; 32BIT-NEXT:    stw 25, 68(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    lwz 25, 188(1)
+; 32BIT-NEXT:    stw 26, 72(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 27, 76(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 28, 80(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 29, 84(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 30, 88(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 31, 92(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stfd 30, 96(1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd 31, 104(1) # 8-byte Folded Spill
+; 32BIT-NEXT:    fmr 31, 3
+; 32BIT-NEXT:    fmr 30, 2
+; 32BIT-NEXT:    mr 31, 10
+; 32BIT-NEXT:    mr 30, 9
+; 32BIT-NEXT:    mr 29, 8
+; 32BIT-NEXT:    mr 28, 7
+; 32BIT-NEXT:    mr 27, 6
+; 32BIT-NEXT:    mr 26, 5
+; 32BIT-NEXT:    stw 5, 144(1)
+; 32BIT-NEXT:    stw 6, 148(1)
+; 32BIT-NEXT:    stw 7, 152(1)
+; 32BIT-NEXT:    stw 8, 156(1)
+; 32BIT-NEXT:    stw 9, 160(1)
+; 32BIT-NEXT:    stw 10, 164(1)
+; 32BIT-NEXT:    stfs 2, 172(1)
+; 32BIT-NEXT:    stfs 3, 176(1)
+; 32BIT-NEXT:    stfd 4, 180(1)
+; 32BIT-NEXT:    stfd 5, 192(1)
+; 32BIT-NEXT:    bl .consume_f64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    fadds 1, 30, 31
+; 32BIT-NEXT:    bl .consume_f32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addc 3, 29, 27
+; 32BIT-NEXT:    adde 4, 28, 26
+; 32BIT-NEXT:    srawi 5, 30, 31
+; 32BIT-NEXT:    addc 3, 3, 30
+; 32BIT-NEXT:    adde 5, 4, 5
+; 32BIT-NEXT:    addc 4, 3, 24
+; 32BIT-NEXT:    adde 3, 5, 31
+; 32BIT-NEXT:    bl .consume_i64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    add 3, 25, 30
+; 32BIT-NEXT:    bl .consume_i32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    lfd 31, 104(1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd 30, 96(1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz 31, 92(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 30, 88(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 29, 84(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 28, 80(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 27, 76(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 26, 72(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 25, 68(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 24, 64(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    addi 1, 1, 112
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: mixed_1:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -176(1)
+; 64BIT-NEXT:    std 0, 192(1)
+; 64BIT-NEXT:    stfd 1, 224(1)
+; 64BIT-NEXT:    xsadddp 1, 1, 5
+; 64BIT-NEXT:    std 27, 120(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    lwz 27, 292(1)
+; 64BIT-NEXT:    std 28, 128(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std 29, 136(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std 30, 144(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std 31, 152(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd 30, 160(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd 31, 168(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    fmr 31, 3
+; 64BIT-NEXT:    fmr 30, 2
+; 64BIT-NEXT:    mr 31, 7
+; 64BIT-NEXT:    mr 30, 6
+; 64BIT-NEXT:    mr 29, 5
+; 64BIT-NEXT:    mr 28, 4
+; 64BIT-NEXT:    std 4, 232(1)
+; 64BIT-NEXT:    std 5, 240(1)
+; 64BIT-NEXT:    std 6, 248(1)
+; 64BIT-NEXT:    std 7, 256(1)
+; 64BIT-NEXT:    stfs 2, 264(1)
+; 64BIT-NEXT:    stfs 3, 272(1)
+; 64BIT-NEXT:    stfd 4, 280(1)
+; 64BIT-NEXT:    stfd 5, 296(1)
+; 64BIT-NEXT:    bl .consume_f64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    fadds 1, 30, 31
+; 64BIT-NEXT:    bl .consume_f32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    add 3, 29, 28
+; 64BIT-NEXT:    add 3, 3, 30
+; 64BIT-NEXT:    add 3, 3, 31
+; 64BIT-NEXT:    bl .consume_i64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    add 3, 27, 30
+; 64BIT-NEXT:    extsw 3, 3
+; 64BIT-NEXT:    bl .consume_i32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    lfd 31, 168(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd 30, 160(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 31, 152(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 30, 144(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 29, 136(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 28, 128(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 27, 120(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    addi 1, 1, 176
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %add = fadd double %a, %j
+  tail call void @consume_f64(double %add)
+  %add1 = fadd float %f, %g
+  tail call void @consume_f32(float %add1)
+  %add2 = add nsw i64 %c, %b
+  %conv = sext i32 %d to i64
+  %add3 = add nsw i64 %add2, %conv
+  %add4 = add nsw i64 %add3, %e
+  tail call void @consume_i64(i64 %add4)
+  %add5 = add nsw i32 %i, %d
+  tail call void @consume_i32(i32 signext %add5)
+  ret void
+}
+
+define void @mixed_2(<2 x double> %a, <4 x i32> %b, i64 %c) #0 {
+; 32BIT-LABEL: mixed_2:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -96(1)
+; 32BIT-NEXT:    li 5, 64
+; 32BIT-NEXT:    stw 0, 104(1)
+; 32BIT-NEXT:    stw 3, 152(1)
+; 32BIT-NEXT:    stw 4, 156(1)
+; 32BIT-NEXT:    stxvd2x 34, 1, 5 # 16-byte Folded Spill
+; 32BIT-NEXT:    addi 5, 1, 120
+; 32BIT-NEXT:    stxvd2x 34, 0, 5
+; 32BIT-NEXT:    addi 5, 1, 136
+; 32BIT-NEXT:    stxvw4x 35, 0, 5
+; 32BIT-NEXT:    addi 5, 1, 80
+; 32BIT-NEXT:    stxvw4x 35, 0, 5
+; 32BIT-NEXT:    lwz 5, 80(1)
+; 32BIT-NEXT:    srawi 6, 5, 31
+; 32BIT-NEXT:    addc 4, 5, 4
+; 32BIT-NEXT:    adde 3, 6, 3
+; 32BIT-NEXT:    bl .consume_i64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    li 3, 64
+; 32BIT-NEXT:    lxvd2x 1, 1, 3 # 16-byte Folded Reload
+; 32BIT-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; 32BIT-NEXT:    bl .consume_f64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 96
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: mixed_2:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -160(1)
+; 64BIT-NEXT:    li 4, 128
+; 64BIT-NEXT:    std 0, 176(1)
+; 64BIT-NEXT:    std 3, 240(1)
+; 64BIT-NEXT:    stxvd2x 34, 1, 4 # 16-byte Folded Spill
+; 64BIT-NEXT:    addi 4, 1, 208
+; 64BIT-NEXT:    stxvd2x 34, 0, 4
+; 64BIT-NEXT:    addi 4, 1, 224
+; 64BIT-NEXT:    stxvw4x 35, 0, 4
+; 64BIT-NEXT:    addi 4, 1, 144
+; 64BIT-NEXT:    stxvw4x 35, 0, 4
+; 64BIT-NEXT:    lwa 4, 144(1)
+; 64BIT-NEXT:    add 3, 4, 3
+; 64BIT-NEXT:    bl .consume_i64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    li 3, 128
+; 64BIT-NEXT:    lxvd2x 1, 1, 3 # 16-byte Folded Reload
+; 64BIT-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; 64BIT-NEXT:    bl .consume_f64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 160
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %b, i64 0
+  %conv = sext i32 %vecext to i64
+  %add = add nsw i64 %conv, %c
+  tail call void @consume_i64(i64 %add)
+  %vecext1 = extractelement <2 x double> %a, i64 0
+  tail call void @consume_f64(double %vecext1)
+  ret void
+}
+
+%struct.foo = type <{ [3 x i32], double, [12 x i8], <4 x i32> }>
+
+define void @mixed_3(<2 x double> %a, i64 %b, double %c, float %d, i32 signext %e, double %f, ...) #0 {
+; 32BIT-LABEL: mixed_3:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -80(1)
+; 32BIT-NEXT:    xsadddp 0, 34, 3
+; 32BIT-NEXT:    stw 0, 88(1)
+; 32BIT-NEXT:    stfd 1, 128(1)
+; 32BIT-NEXT:    stw 29, 60(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    addi 3, 1, 104
+; 32BIT-NEXT:    lwz 29, 148(1)
+; 32BIT-NEXT:    stw 30, 64(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw 31, 68(1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stfd 31, 72(1) # 8-byte Folded Spill
+; 32BIT-NEXT:    fmr 31, 2
+; 32BIT-NEXT:    mr 31, 10
+; 32BIT-NEXT:    mr 30, 9
+; 32BIT-NEXT:    xsadddp 1, 0, 1
+; 32BIT-NEXT:    stxvd2x 34, 0, 3
+; 32BIT-NEXT:    stw 9, 120(1)
+; 32BIT-NEXT:    stw 10, 124(1)
+; 32BIT-NEXT:    stfs 2, 136(1)
+; 32BIT-NEXT:    stfd 3, 144(1)
+; 32BIT-NEXT:    bl .consume_f64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    mr 3, 30
+; 32BIT-NEXT:    mr 4, 31
+; 32BIT-NEXT:    bl .consume_i64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    fmr 1, 31
+; 32BIT-NEXT:    bl .consume_f32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    mr 3, 29
+; 32BIT-NEXT:    bl .consume_i32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz 31, 68(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 30, 64(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz 29, 60(1) # 4-byte Folded Reload
+; 32BIT-NEXT:    addi 1, 1, 80
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+; 32BIT: .byte 0x48 # -HasExtensionTable, +HasVectorInfo, NumOfGPRsSaved = 8
+;
+; 64BIT-LABEL: mixed_3:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -144(1)
+; 64BIT-NEXT:    xsadddp 0, 34, 3
+; 64BIT-NEXT:    std 0, 160(1)
+; 64BIT-NEXT:    stfd 1, 216(1)
+; 64BIT-NEXT:    addi 3, 1, 192
+; 64BIT-NEXT:    std 30, 120(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std 31, 128(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd 31, 136(1) # 8-byte Folded Spill
+; 64BIT-NEXT:    mr 31, 8
+; 64BIT-NEXT:    fmr 31, 2
+; 64BIT-NEXT:    mr 30, 5
+; 64BIT-NEXT:    stxvd2x 34, 0, 3
+; 64BIT-NEXT:    xsadddp 1, 0, 1
+; 64BIT-NEXT:    std 5, 208(1)
+; 64BIT-NEXT:    stfs 2, 232(1)
+; 64BIT-NEXT:    std 6, 224(1)
+; 64BIT-NEXT:    std 8, 248(1)
+; 64BIT-NEXT:    std 7, 240(1)
+; 64BIT-NEXT:    stfd 3, 256(1)
+; 64BIT-NEXT:    std 9, 264(1)
+; 64BIT-NEXT:    std 10, 248(1)
+; 64BIT-NEXT:    bl .consume_f64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    mr 3, 30
+; 64BIT-NEXT:    bl .consume_i64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    fmr 1, 31
+; 64BIT-NEXT:    bl .consume_f32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    mr 3, 31
+; 64BIT-NEXT:    bl .consume_i32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    lfd 31, 136(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 31, 128(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld 30, 120(1) # 8-byte Folded Reload
+; 64BIT-NEXT:    addi 1, 1, 144
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+; 64BIT: .byte 0x48 # -HasExtensionTable, +HasVectorInfo, NumOfGPRsSaved = 8
+entry:
+  %vecext = extractelement <2 x double> %a, i64 0
+  %add = fadd double %vecext, %f
+  %add1 = fadd double %add, %c
+  tail call void @consume_f64(double %add1)
+  tail call void @consume_i64(i64 %b)
+  tail call void @consume_f32(float %d)
+  tail call void @consume_i32(i32 signext %e)
+  ret void
+}
+
+define signext i32 @mixed_4(ptr byval(%struct.foo) align 16 %foo, i32 %sec) #0 {
+; 32BIT-LABEL: mixed_4:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 9, 48(1)
+; 32BIT-NEXT:    stw 8, 44(1)
+; 32BIT-NEXT:    lfd 0, 44(1)
+; 32BIT-NEXT:    addi 3, 1, -4
+; 32BIT-NEXT:    xscvdpsxws 0, 0
+; 32BIT-NEXT:    stw 5, 32(1)
+; 32BIT-NEXT:    stw 6, 36(1)
+; 32BIT-NEXT:    stw 7, 40(1)
+; 32BIT-NEXT:    stw 10, 52(1)
+; 32BIT-NEXT:    stfiwx 0, 0, 3
+; 32BIT-NEXT:    lwz 3, -4(1)
+; 32BIT-NEXT:    lwz 4, 76(1)
+; 32BIT-NEXT:    add 3, 5, 3
+; 32BIT-NEXT:    add 3, 3, 4
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: mixed_4:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 5, 64(1)
+; 64BIT-NEXT:    std 4, 56(1)
+; 64BIT-NEXT:    lfd 0, 60(1)
+; 64BIT-NEXT:    addi 4, 1, -4
+; 64BIT-NEXT:    xscvdpsxws 0, 0
+; 64BIT-NEXT:    std 3, 48(1)
+; 64BIT-NEXT:    std 6, 72(1)
+; 64BIT-NEXT:    std 7, 80(1)
+; 64BIT-NEXT:    std 8, 88(1)
+; 64BIT-NEXT:    std 9, 96(1)
+; 64BIT-NEXT:    rldicl 3, 3, 32, 32
+; 64BIT-NEXT:    stfiwx 0, 0, 4
+; 64BIT-NEXT:    lwz 4, -4(1)
+; 64BIT-NEXT:    add 3, 3, 4
+; 64BIT-NEXT:    add 3, 3, 8
+; 64BIT-NEXT:    extsw 3, 3
+; 64BIT-NEXT:    blr
+entry:
+  %0 = load i32, ptr %foo, align 16
+  %x = getelementptr inbounds i8, ptr %foo, i64 12
+  %1 = load double, ptr %x, align 4
+  %conv = fptosi double %1 to i32
+  %add = add nsw i32 %0, %conv
+  %2 = getelementptr inbounds i8, ptr %foo, i64 44
+  %vecext = load i32, ptr %2, align 4
+  %add1 = add nsw i32 %add, %vecext
+  ret i32 %add1
+}
+
+declare void @foo() #0
+declare void @consume_f64(double) #0
+declare void @consume_f32(float) #0
+declare void @consume_i64(i64) #0
+declare void @consume_i32(i32 signext) #0
+
+attributes #0 = { "save-reg-params" nofree noinline nounwind }

>From a52e7d53fe073e65b24acee1fe18d97251600547 Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan at cn.ibm.com>
Date: Mon, 8 Jul 2024 15:51:55 +0800
Subject: [PATCH 2/2] Address comments

- Restrict the option to AIX
- In IR, use a module flag instead of a function attribute
- Add more test cases
- Simplify the change to AsmPrinter
- Add test cases for byref
---
 clang/include/clang/Driver/Options.td         |   3 +-
 clang/lib/CodeGen/CGCall.cpp                  |   3 -
 clang/lib/CodeGen/Targets/PPC.cpp             |  11 +
 clang/test/CodeGen/PowerPC/save-reg-params.c  |   6 +-
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     |   9 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  12 +-
 .../Target/PowerPC/PPCMachineFunctionInfo.h   |   6 -
 llvm/test/CodeGen/PowerPC/save-reg-params.ll  | 356 ++++++++++--------
 8 files changed, 233 insertions(+), 173 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4135f0db604509..c0eb280a84ef89 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5041,8 +5041,9 @@ def mno_spe : Flag<["-"], "mno-spe">, Group<m_ppc_Features_Group>;
 def mefpu2 : Flag<["-"], "mefpu2">, Group<m_ppc_Features_Group>;
 } // let Flags = [TargetSpecific]
 def msave_reg_params : Flag<["-"], "msave-reg-params">, Group<m_Group>,
+  Flags<[TargetSpecific]>,
   Visibility<[ClangOption, CC1Option]>,
-  HelpText<"Save arguments passed by registers to stack">,
+  HelpText<"Save arguments passed by registers to ABI-defined stack positions">,
   MarshallingInfoFlag<CodeGenOpts<"SaveRegParams">>;
 def mabi_EQ_quadword_atomics : Flag<["-"], "mabi=quadword-atomics">,
   Group<m_Group>, Visibility<[ClangOption, CC1Option]>,
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 8269755cdbf89d..2b301130ef7b70 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1931,9 +1931,6 @@ static void getTrivialDefaultFunctionAttributes(
     if (CodeGenOpts.NullPointerIsValid)
       FuncAttrs.addAttribute(llvm::Attribute::NullPointerIsValid);
 
-    if (CodeGenOpts.SaveRegParams)
-      FuncAttrs.addAttribute("save-reg-params");
-
     if (LangOpts.getDefaultExceptionMode() == LangOptions::FPE_Ignore)
       FuncAttrs.addAttribute("no-trapping-math", "true");
 
diff --git a/clang/lib/CodeGen/Targets/PPC.cpp b/clang/lib/CodeGen/Targets/PPC.cpp
index e4155810963eb8..185f4802b11499 100644
--- a/clang/lib/CodeGen/Targets/PPC.cpp
+++ b/clang/lib/CodeGen/Targets/PPC.cpp
@@ -146,6 +146,10 @@ class AIXTargetCodeGenInfo : public TargetCodeGenInfo {
 
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
+
+  void emitTargetMetadata(CodeGen::CodeGenModule &CGM,
+                          const llvm::MapVector<GlobalDecl, StringRef>
+                              &MangledDeclNames) const override;
 };
 } // namespace
 
@@ -321,6 +325,13 @@ void AIXTargetCodeGenInfo::setTargetAttributes(
   }
 }
 
+void AIXTargetCodeGenInfo::emitTargetMetadata(
+    CodeGen::CodeGenModule &CGM,
+    const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {
+  if (CGM.getCodeGenOpts().SaveRegParams)
+    CGM.getModule().addModuleFlag(llvm::Module::Error, "save-reg-params", 1);
+}
+
 // PowerPC-32
 namespace {
 /// PPC32_SVR4_ABIInfo - The 32-bit PowerPC ELF (SVR4) ABI information.
diff --git a/clang/test/CodeGen/PowerPC/save-reg-params.c b/clang/test/CodeGen/PowerPC/save-reg-params.c
index 6599310afa41a3..7e98de22d3a519 100644
--- a/clang/test/CodeGen/PowerPC/save-reg-params.c
+++ b/clang/test/CodeGen/PowerPC/save-reg-params.c
@@ -1,12 +1,10 @@
-// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s -msave-reg-params | FileCheck -check-prefix=SAVE %s
 // RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -o - %s -msave-reg-params | FileCheck -check-prefix=SAVE %s
 // RUN: %clang_cc1 -triple powerpc-ibm-aix -emit-llvm -o - %s -msave-reg-params | FileCheck -check-prefix=SAVE %s
-// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=NOSAVE %s
 // RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -o - %s | FileCheck -check-prefix=NOSAVE %s
 // RUN: %clang_cc1 -triple powerpc-ibm-aix -emit-llvm -o - %s | FileCheck -check-prefix=NOSAVE %s
 
 void bar(int);
 void foo(int x) { bar(x); }
 
-// SAVE: attributes #{{[0-9]+}} = { {{.+}} "save-reg-params" {{.+}} }
-// NOSAVE-NOT: "save-reg-params"···
\ No newline at end of file
+// SAVE: !{i32 1, !"save-reg-params", i32 1}
+// NOSAVE-NOT: !{i32 1, !"save-reg-params", i32 1}
\ No newline at end of file
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 085a67e9194215..2fa504218e3157 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2496,7 +2496,12 @@ void PPCAIXAsmPrinter::emitTracebackTable() {
 
   uint32_t GPRSaved = 0;
 
-  if (FI->getForceGPRSaveCount() < 0) {
+  bool SaveParams =
+      MF->getFunction().getParent()->getModuleFlag("save-reg-params");
+  if (SaveParams) {
+    // Assuming eight GPRs matches XL behavior for varargs.
+    GPRSaved = 8;
+  } else {
     // X13 is reserved under 64-bit environment.
     unsigned GPRBegin = Subtarget->isPPC64() ? PPC::X14 : PPC::R13;
     unsigned GPREnd = Subtarget->isPPC64() ? PPC::X31 : PPC::R31;
@@ -2507,8 +2512,6 @@ void PPCAIXAsmPrinter::emitTracebackTable() {
         break;
       }
     }
-  } else {
-    GPRSaved = FI->getForceGPRSaveCount();
   }
 
   SecondHalfOfMandatoryField |= (GPRSaved << TracebackTable::GPRSavedShift) &
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 9b5d448d0ce448..e519551934b1e6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7225,7 +7225,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
   CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
   uint64_t SaveStackPos = CCInfo.getStackSize();
-  bool SaveParams = MF.getFunction().hasFnAttribute("save-reg-params");
+  bool SaveParams =
+      MF.getFunction().getParent()->getModuleFlag("save-reg-params");
   CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
 
   SmallVector<SDValue, 8> MemOps;
@@ -7247,10 +7248,10 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
     if (SaveParams && VA.isRegLoc() && !Flags.isByVal()) {
       const TargetRegisterClass *RegClass = getRegClassForSVT(
           LocVT.SimpleTy, IsPPC64, Subtarget.hasP8Vector(), Subtarget.hasVSX());
-      // On PPC64, we need to use std instead of stw for GPR.
+      // On PPC64, debuggers assume extended 8-byte values are stored from GPRs.
       MVT SaveVT = RegClass == &PPC::G8RCRegClass ? MVT::i64 : LocVT;
       const Register VReg = MF.addLiveIn(VA.getLocReg(), RegClass);
-      SDValue Parm = DAG.getRegister(VReg, SaveVT);
+      SDValue Parm = DAG.getCopyFromReg(Chain, dl, VReg, SaveVT);
       int FI = MFI.CreateFixedObject(SaveVT.getStoreSize(), SaveStackPos, true);
       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
       SDValue StoreReg = DAG.getStore(Chain, dl, Parm, FIN,
@@ -7478,11 +7479,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   FuncInfo->setMinReservedArea(CallerReservedArea);
 
   if (isVarArg) {
-    // Maximum number of saved GPR in traceback table is 8, for varargs,
-    // assuming eight GPRs matches XL behavior.
-    if (SaveParams)
-      FuncInfo->setForceGPRSaveCount(8);
-
     FuncInfo->setVarArgsFrameIndex(
         MFI.CreateFixedObject(PtrByteSize, CCInfo.getStackSize(), true));
     SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index a9e7d63237c7bc..b7d14da05ee248 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -150,9 +150,6 @@ class PPCFunctionInfo : public MachineFunctionInfo {
   /// to use SExt/ZExt flags in later optimization.
   std::vector<std::pair<Register, ISD::ArgFlagsTy>> LiveInAttrs;
 
-  /// Set a fixed number of saved GPRs, negative if it's non-fixed.
-  int ForceGPRSaveCount = -1;
-
   /// Flags for aix-shared-lib-tls-model-opt, will be lazily initialized for
   /// each function.
   bool AIXFuncUseTLSIEForLD = false;
@@ -166,9 +163,6 @@ class PPCFunctionInfo : public MachineFunctionInfo {
         const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
-  int getForceGPRSaveCount() const { return ForceGPRSaveCount; }
-  void setForceGPRSaveCount(int Num) { ForceGPRSaveCount = Num; }
-
   int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
   void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
 
diff --git a/llvm/test/CodeGen/PowerPC/save-reg-params.ll b/llvm/test/CodeGen/PowerPC/save-reg-params.ll
index ad29a202db115f..3b6cf694254096 100644
--- a/llvm/test/CodeGen/PowerPC/save-reg-params.ll
+++ b/llvm/test/CodeGen/PowerPC/save-reg-params.ll
@@ -2,20 +2,20 @@
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc-ibm-aix -mcpu=pwr7 < %s | FileCheck %s -check-prefix=32BIT
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix -mcpu=pwr7 < %s | FileCheck %s -check-prefix=64BIT
 
-define void @i64_join(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i, i64 %j) #0 {
+define void @i64_join(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i, i64 %j) {
 ; 32BIT-LABEL: i64_join:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
 ; 32BIT-NEXT:    stwu 1, -64(1)
 ; 32BIT-NEXT:    stw 0, 72(1)
-; 32BIT-NEXT:    stw 10, 116(1)
-; 32BIT-NEXT:    stw 9, 112(1)
-; 32BIT-NEXT:    stw 8, 108(1)
-; 32BIT-NEXT:    stw 7, 104(1)
-; 32BIT-NEXT:    stw 6, 100(1)
-; 32BIT-NEXT:    stw 5, 96(1)
-; 32BIT-NEXT:    stw 4, 92(1)
 ; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 10, 116(1)
 ; 32BIT-NEXT:    bl .foo[PR]
 ; 32BIT-NEXT:    nop
 ; 32BIT-NEXT:    addi 1, 1, 64
@@ -28,14 +28,14 @@ define void @i64_join(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i6
 ; 64BIT-NEXT:    mflr 0
 ; 64BIT-NEXT:    stdu 1, -112(1)
 ; 64BIT-NEXT:    std 0, 128(1)
-; 64BIT-NEXT:    std 10, 216(1)
-; 64BIT-NEXT:    std 9, 208(1)
-; 64BIT-NEXT:    std 8, 200(1)
-; 64BIT-NEXT:    std 7, 192(1)
-; 64BIT-NEXT:    std 6, 184(1)
-; 64BIT-NEXT:    std 5, 176(1)
-; 64BIT-NEXT:    std 4, 168(1)
 ; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 10, 216(1)
 ; 64BIT-NEXT:    bl .foo[PR]
 ; 64BIT-NEXT:    nop
 ; 64BIT-NEXT:    addi 1, 1, 112
@@ -56,20 +56,20 @@ entry:
   ret void
 }
 
-define void @i64_join_missing(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i, i64 %j) #0 {
+define void @i64_join_missing(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i, i64 %j) {
 ; 32BIT-LABEL: i64_join_missing:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
 ; 32BIT-NEXT:    stwu 1, -64(1)
 ; 32BIT-NEXT:    stw 0, 72(1)
-; 32BIT-NEXT:    stw 10, 116(1)
-; 32BIT-NEXT:    stw 9, 112(1)
-; 32BIT-NEXT:    stw 8, 108(1)
-; 32BIT-NEXT:    stw 7, 104(1)
-; 32BIT-NEXT:    stw 6, 100(1)
-; 32BIT-NEXT:    stw 5, 96(1)
-; 32BIT-NEXT:    stw 4, 92(1)
 ; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 10, 116(1)
 ; 32BIT-NEXT:    bl .foo[PR]
 ; 32BIT-NEXT:    nop
 ; 32BIT-NEXT:    addi 1, 1, 64
@@ -82,14 +82,14 @@ define void @i64_join_missing(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i6
 ; 64BIT-NEXT:    mflr 0
 ; 64BIT-NEXT:    stdu 1, -112(1)
 ; 64BIT-NEXT:    std 0, 128(1)
-; 64BIT-NEXT:    std 10, 216(1)
-; 64BIT-NEXT:    std 9, 208(1)
-; 64BIT-NEXT:    std 8, 200(1)
-; 64BIT-NEXT:    std 7, 192(1)
-; 64BIT-NEXT:    std 6, 184(1)
-; 64BIT-NEXT:    std 5, 176(1)
-; 64BIT-NEXT:    std 4, 168(1)
 ; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 10, 216(1)
 ; 64BIT-NEXT:    bl .foo[PR]
 ; 64BIT-NEXT:    nop
 ; 64BIT-NEXT:    addi 1, 1, 112
@@ -109,20 +109,20 @@ entry:
   ret void
 }
 
-define void @i32_join(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h, i32 signext %i, i32 signext %j) #0 {
+define void @i32_join(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h, i32 signext %i, i32 signext %j) {
 ; 32BIT-LABEL: i32_join:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
 ; 32BIT-NEXT:    stwu 1, -64(1)
 ; 32BIT-NEXT:    stw 0, 72(1)
-; 32BIT-NEXT:    stw 10, 116(1)
-; 32BIT-NEXT:    stw 9, 112(1)
-; 32BIT-NEXT:    stw 8, 108(1)
-; 32BIT-NEXT:    stw 7, 104(1)
-; 32BIT-NEXT:    stw 6, 100(1)
-; 32BIT-NEXT:    stw 5, 96(1)
-; 32BIT-NEXT:    stw 4, 92(1)
 ; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 10, 116(1)
 ; 32BIT-NEXT:    bl .foo[PR]
 ; 32BIT-NEXT:    nop
 ; 32BIT-NEXT:    addi 1, 1, 64
@@ -135,14 +135,14 @@ define void @i32_join(i32 signext %a, i32 signext %b, i32 signext %c, i32 signex
 ; 64BIT-NEXT:    mflr 0
 ; 64BIT-NEXT:    stdu 1, -112(1)
 ; 64BIT-NEXT:    std 0, 128(1)
-; 64BIT-NEXT:    std 10, 216(1)
-; 64BIT-NEXT:    std 9, 208(1)
-; 64BIT-NEXT:    std 8, 200(1)
-; 64BIT-NEXT:    std 7, 192(1)
-; 64BIT-NEXT:    std 6, 184(1)
-; 64BIT-NEXT:    std 5, 176(1)
-; 64BIT-NEXT:    std 4, 168(1)
 ; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 10, 216(1)
 ; 64BIT-NEXT:    bl .foo[PR]
 ; 64BIT-NEXT:    nop
 ; 64BIT-NEXT:    addi 1, 1, 112
@@ -163,20 +163,20 @@ entry:
   ret void
 }
 
-define void @i32_join_missing(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h, i32 signext %i, i32 signext %j) #0 {
+define void @i32_join_missing(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h, i32 signext %i, i32 signext %j) {
 ; 32BIT-LABEL: i32_join_missing:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
 ; 32BIT-NEXT:    stwu 1, -64(1)
 ; 32BIT-NEXT:    stw 0, 72(1)
-; 32BIT-NEXT:    stw 10, 116(1)
-; 32BIT-NEXT:    stw 9, 112(1)
-; 32BIT-NEXT:    stw 8, 108(1)
-; 32BIT-NEXT:    stw 7, 104(1)
-; 32BIT-NEXT:    stw 6, 100(1)
-; 32BIT-NEXT:    stw 5, 96(1)
-; 32BIT-NEXT:    stw 4, 92(1)
 ; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    stw 4, 92(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 10, 116(1)
 ; 32BIT-NEXT:    bl .foo[PR]
 ; 32BIT-NEXT:    nop
 ; 32BIT-NEXT:    addi 1, 1, 64
@@ -189,14 +189,14 @@ define void @i32_join_missing(i32 signext %a, i32 signext %b, i32 signext %c, i3
 ; 64BIT-NEXT:    mflr 0
 ; 64BIT-NEXT:    stdu 1, -112(1)
 ; 64BIT-NEXT:    std 0, 128(1)
-; 64BIT-NEXT:    std 10, 216(1)
-; 64BIT-NEXT:    std 9, 208(1)
-; 64BIT-NEXT:    std 8, 200(1)
-; 64BIT-NEXT:    std 7, 192(1)
-; 64BIT-NEXT:    std 6, 184(1)
-; 64BIT-NEXT:    std 5, 176(1)
-; 64BIT-NEXT:    std 4, 168(1)
 ; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    std 4, 168(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 10, 216(1)
 ; 64BIT-NEXT:    bl .foo[PR]
 ; 64BIT-NEXT:    nop
 ; 64BIT-NEXT:    addi 1, 1, 112
@@ -216,22 +216,22 @@ entry:
   ret void
 }
 
-define void @f32_join(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j) #0 {
+define void @f32_join(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j) {
 ; 32BIT-LABEL: f32_join:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
 ; 32BIT-NEXT:    stwu 1, -64(1)
 ; 32BIT-NEXT:    stw 0, 72(1)
-; 32BIT-NEXT:    stfs 10, 124(1)
-; 32BIT-NEXT:    stfs 9, 120(1)
-; 32BIT-NEXT:    stfs 8, 116(1)
-; 32BIT-NEXT:    stfs 7, 112(1)
-; 32BIT-NEXT:    stfs 6, 108(1)
-; 32BIT-NEXT:    stfs 5, 104(1)
-; 32BIT-NEXT:    stfs 4, 100(1)
-; 32BIT-NEXT:    stfs 3, 96(1)
-; 32BIT-NEXT:    stfs 2, 92(1)
 ; 32BIT-NEXT:    stfs 1, 88(1)
+; 32BIT-NEXT:    stfs 2, 92(1)
+; 32BIT-NEXT:    stfs 3, 96(1)
+; 32BIT-NEXT:    stfs 4, 100(1)
+; 32BIT-NEXT:    stfs 5, 104(1)
+; 32BIT-NEXT:    stfs 6, 108(1)
+; 32BIT-NEXT:    stfs 7, 112(1)
+; 32BIT-NEXT:    stfs 8, 116(1)
+; 32BIT-NEXT:    stfs 9, 120(1)
+; 32BIT-NEXT:    stfs 10, 124(1)
 ; 32BIT-NEXT:    bl .foo[PR]
 ; 32BIT-NEXT:    nop
 ; 32BIT-NEXT:    addi 1, 1, 64
@@ -244,16 +244,16 @@ define void @f32_join(float %a, float %b, float %c, float %d, float %e, float %f
 ; 64BIT-NEXT:    mflr 0
 ; 64BIT-NEXT:    stdu 1, -112(1)
 ; 64BIT-NEXT:    std 0, 128(1)
-; 64BIT-NEXT:    stfs 10, 232(1)
-; 64BIT-NEXT:    stfs 9, 224(1)
-; 64BIT-NEXT:    stfs 8, 216(1)
-; 64BIT-NEXT:    stfs 7, 208(1)
-; 64BIT-NEXT:    stfs 6, 200(1)
-; 64BIT-NEXT:    stfs 5, 192(1)
-; 64BIT-NEXT:    stfs 4, 184(1)
-; 64BIT-NEXT:    stfs 3, 176(1)
-; 64BIT-NEXT:    stfs 2, 168(1)
 ; 64BIT-NEXT:    stfs 1, 160(1)
+; 64BIT-NEXT:    stfs 2, 168(1)
+; 64BIT-NEXT:    stfs 3, 176(1)
+; 64BIT-NEXT:    stfs 4, 184(1)
+; 64BIT-NEXT:    stfs 5, 192(1)
+; 64BIT-NEXT:    stfs 6, 200(1)
+; 64BIT-NEXT:    stfs 7, 208(1)
+; 64BIT-NEXT:    stfs 8, 216(1)
+; 64BIT-NEXT:    stfs 9, 224(1)
+; 64BIT-NEXT:    stfs 10, 232(1)
 ; 64BIT-NEXT:    bl .foo[PR]
 ; 64BIT-NEXT:    nop
 ; 64BIT-NEXT:    addi 1, 1, 112
@@ -274,22 +274,22 @@ entry:
   ret void
 }
 
-define void @f32_join_missing(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j) #0 {
+define void @f32_join_missing(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j) {
 ; 32BIT-LABEL: f32_join_missing:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
 ; 32BIT-NEXT:    stwu 1, -64(1)
 ; 32BIT-NEXT:    stw 0, 72(1)
-; 32BIT-NEXT:    stfs 10, 124(1)
-; 32BIT-NEXT:    stfs 9, 120(1)
-; 32BIT-NEXT:    stfs 8, 116(1)
-; 32BIT-NEXT:    stfs 7, 112(1)
-; 32BIT-NEXT:    stfs 6, 108(1)
-; 32BIT-NEXT:    stfs 5, 104(1)
-; 32BIT-NEXT:    stfs 4, 100(1)
-; 32BIT-NEXT:    stfs 3, 96(1)
-; 32BIT-NEXT:    stfs 2, 92(1)
 ; 32BIT-NEXT:    stfs 1, 88(1)
+; 32BIT-NEXT:    stfs 2, 92(1)
+; 32BIT-NEXT:    stfs 3, 96(1)
+; 32BIT-NEXT:    stfs 4, 100(1)
+; 32BIT-NEXT:    stfs 5, 104(1)
+; 32BIT-NEXT:    stfs 6, 108(1)
+; 32BIT-NEXT:    stfs 7, 112(1)
+; 32BIT-NEXT:    stfs 8, 116(1)
+; 32BIT-NEXT:    stfs 9, 120(1)
+; 32BIT-NEXT:    stfs 10, 124(1)
 ; 32BIT-NEXT:    bl .foo[PR]
 ; 32BIT-NEXT:    nop
 ; 32BIT-NEXT:    addi 1, 1, 64
@@ -302,16 +302,16 @@ define void @f32_join_missing(float %a, float %b, float %c, float %d, float %e,
 ; 64BIT-NEXT:    mflr 0
 ; 64BIT-NEXT:    stdu 1, -112(1)
 ; 64BIT-NEXT:    std 0, 128(1)
-; 64BIT-NEXT:    stfs 10, 232(1)
-; 64BIT-NEXT:    stfs 9, 224(1)
-; 64BIT-NEXT:    stfs 8, 216(1)
-; 64BIT-NEXT:    stfs 7, 208(1)
-; 64BIT-NEXT:    stfs 6, 200(1)
-; 64BIT-NEXT:    stfs 5, 192(1)
-; 64BIT-NEXT:    stfs 4, 184(1)
-; 64BIT-NEXT:    stfs 3, 176(1)
-; 64BIT-NEXT:    stfs 2, 168(1)
 ; 64BIT-NEXT:    stfs 1, 160(1)
+; 64BIT-NEXT:    stfs 2, 168(1)
+; 64BIT-NEXT:    stfs 3, 176(1)
+; 64BIT-NEXT:    stfs 4, 184(1)
+; 64BIT-NEXT:    stfs 5, 192(1)
+; 64BIT-NEXT:    stfs 6, 200(1)
+; 64BIT-NEXT:    stfs 7, 208(1)
+; 64BIT-NEXT:    stfs 8, 216(1)
+; 64BIT-NEXT:    stfs 9, 224(1)
+; 64BIT-NEXT:    stfs 10, 232(1)
 ; 64BIT-NEXT:    bl .foo[PR]
 ; 64BIT-NEXT:    nop
 ; 64BIT-NEXT:    addi 1, 1, 112
@@ -332,22 +332,22 @@ entry:
   ret void
 }
 
-define void @f64_join(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j) #0 {
+define void @f64_join(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j) {
 ; 32BIT-LABEL: f64_join:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
 ; 32BIT-NEXT:    stwu 1, -64(1)
 ; 32BIT-NEXT:    stw 0, 72(1)
-; 32BIT-NEXT:    stfd 10, 160(1)
-; 32BIT-NEXT:    stfd 9, 152(1)
-; 32BIT-NEXT:    stfd 8, 144(1)
-; 32BIT-NEXT:    stfd 7, 136(1)
-; 32BIT-NEXT:    stfd 6, 128(1)
-; 32BIT-NEXT:    stfd 5, 120(1)
-; 32BIT-NEXT:    stfd 4, 112(1)
-; 32BIT-NEXT:    stfd 3, 104(1)
-; 32BIT-NEXT:    stfd 2, 96(1)
 ; 32BIT-NEXT:    stfd 1, 88(1)
+; 32BIT-NEXT:    stfd 2, 96(1)
+; 32BIT-NEXT:    stfd 3, 104(1)
+; 32BIT-NEXT:    stfd 4, 112(1)
+; 32BIT-NEXT:    stfd 5, 120(1)
+; 32BIT-NEXT:    stfd 6, 128(1)
+; 32BIT-NEXT:    stfd 7, 136(1)
+; 32BIT-NEXT:    stfd 8, 144(1)
+; 32BIT-NEXT:    stfd 9, 152(1)
+; 32BIT-NEXT:    stfd 10, 160(1)
 ; 32BIT-NEXT:    bl .foo[PR]
 ; 32BIT-NEXT:    nop
 ; 32BIT-NEXT:    addi 1, 1, 64
@@ -360,16 +360,16 @@ define void @f64_join(double %a, double %b, double %c, double %d, double %e, dou
 ; 64BIT-NEXT:    mflr 0
 ; 64BIT-NEXT:    stdu 1, -112(1)
 ; 64BIT-NEXT:    std 0, 128(1)
-; 64BIT-NEXT:    stfd 10, 232(1)
-; 64BIT-NEXT:    stfd 9, 224(1)
-; 64BIT-NEXT:    stfd 8, 216(1)
-; 64BIT-NEXT:    stfd 7, 208(1)
-; 64BIT-NEXT:    stfd 6, 200(1)
-; 64BIT-NEXT:    stfd 5, 192(1)
-; 64BIT-NEXT:    stfd 4, 184(1)
-; 64BIT-NEXT:    stfd 3, 176(1)
-; 64BIT-NEXT:    stfd 2, 168(1)
 ; 64BIT-NEXT:    stfd 1, 160(1)
+; 64BIT-NEXT:    stfd 2, 168(1)
+; 64BIT-NEXT:    stfd 3, 176(1)
+; 64BIT-NEXT:    stfd 4, 184(1)
+; 64BIT-NEXT:    stfd 5, 192(1)
+; 64BIT-NEXT:    stfd 6, 200(1)
+; 64BIT-NEXT:    stfd 7, 208(1)
+; 64BIT-NEXT:    stfd 8, 216(1)
+; 64BIT-NEXT:    stfd 9, 224(1)
+; 64BIT-NEXT:    stfd 10, 232(1)
 ; 64BIT-NEXT:    bl .foo[PR]
 ; 64BIT-NEXT:    nop
 ; 64BIT-NEXT:    addi 1, 1, 112
@@ -390,22 +390,22 @@ entry:
   ret void
 }
 
-define void @f64_missing(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j) #0 {
+define void @f64_missing(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j) {
 ; 32BIT-LABEL: f64_missing:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
 ; 32BIT-NEXT:    stwu 1, -64(1)
 ; 32BIT-NEXT:    stw 0, 72(1)
-; 32BIT-NEXT:    stfd 10, 160(1)
-; 32BIT-NEXT:    stfd 9, 152(1)
-; 32BIT-NEXT:    stfd 8, 144(1)
-; 32BIT-NEXT:    stfd 7, 136(1)
-; 32BIT-NEXT:    stfd 6, 128(1)
-; 32BIT-NEXT:    stfd 5, 120(1)
-; 32BIT-NEXT:    stfd 4, 112(1)
-; 32BIT-NEXT:    stfd 3, 104(1)
-; 32BIT-NEXT:    stfd 2, 96(1)
 ; 32BIT-NEXT:    stfd 1, 88(1)
+; 32BIT-NEXT:    stfd 2, 96(1)
+; 32BIT-NEXT:    stfd 3, 104(1)
+; 32BIT-NEXT:    stfd 4, 112(1)
+; 32BIT-NEXT:    stfd 5, 120(1)
+; 32BIT-NEXT:    stfd 6, 128(1)
+; 32BIT-NEXT:    stfd 7, 136(1)
+; 32BIT-NEXT:    stfd 8, 144(1)
+; 32BIT-NEXT:    stfd 9, 152(1)
+; 32BIT-NEXT:    stfd 10, 160(1)
 ; 32BIT-NEXT:    bl .foo[PR]
 ; 32BIT-NEXT:    nop
 ; 32BIT-NEXT:    addi 1, 1, 64
@@ -418,16 +418,16 @@ define void @f64_missing(double %a, double %b, double %c, double %d, double %e,
 ; 64BIT-NEXT:    mflr 0
 ; 64BIT-NEXT:    stdu 1, -112(1)
 ; 64BIT-NEXT:    std 0, 128(1)
-; 64BIT-NEXT:    stfd 10, 232(1)
-; 64BIT-NEXT:    stfd 9, 224(1)
-; 64BIT-NEXT:    stfd 8, 216(1)
-; 64BIT-NEXT:    stfd 7, 208(1)
-; 64BIT-NEXT:    stfd 6, 200(1)
-; 64BIT-NEXT:    stfd 5, 192(1)
-; 64BIT-NEXT:    stfd 4, 184(1)
-; 64BIT-NEXT:    stfd 3, 176(1)
-; 64BIT-NEXT:    stfd 2, 168(1)
 ; 64BIT-NEXT:    stfd 1, 160(1)
+; 64BIT-NEXT:    stfd 2, 168(1)
+; 64BIT-NEXT:    stfd 3, 176(1)
+; 64BIT-NEXT:    stfd 4, 184(1)
+; 64BIT-NEXT:    stfd 5, 192(1)
+; 64BIT-NEXT:    stfd 6, 200(1)
+; 64BIT-NEXT:    stfd 7, 208(1)
+; 64BIT-NEXT:    stfd 8, 216(1)
+; 64BIT-NEXT:    stfd 9, 224(1)
+; 64BIT-NEXT:    stfd 10, 232(1)
 ; 64BIT-NEXT:    bl .foo[PR]
 ; 64BIT-NEXT:    nop
 ; 64BIT-NEXT:    addi 1, 1, 112
@@ -448,7 +448,7 @@ entry:
   ret void
 }
 
-define void @mixed_1(double %a, i64 %b, i64 %c, i32 signext %d, i64 %e, float %f, float %g, double %h, i32 signext %i, double %j) #0 {
+define void @mixed_1(double %a, i64 %b, i64 %c, i32 signext %d, i64 %e, float %f, float %g, double %h, i32 signext %i, double %j) {
 ; 32BIT-LABEL: mixed_1:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
@@ -587,7 +587,7 @@ entry:
   ret void
 }
 
-define void @mixed_2(<2 x double> %a, <4 x i32> %b, i64 %c) #0 {
+define void @mixed_2(<2 x double> %a, <4 x i32> %b, i64 %c) {
 ; 32BIT-LABEL: mixed_2:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
@@ -658,7 +658,7 @@ entry:
 
 %struct.foo = type <{ [3 x i32], double, [12 x i8], <4 x i32> }>
 
-define void @mixed_3(<2 x double> %a, i64 %b, double %c, float %d, i32 signext %e, double %f, ...) #0 {
+define void @mixed_3(<2 x double> %a, i64 %b, double %c, float %d, i32 signext %e, double %f, ...) {
 ; 32BIT-LABEL: mixed_3:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    mflr 0
@@ -720,10 +720,10 @@ define void @mixed_3(<2 x double> %a, i64 %b, double %c, float %d, i32 signext %
 ; 64BIT-NEXT:    stxvd2x 34, 0, 3
 ; 64BIT-NEXT:    xsadddp 1, 0, 1
 ; 64BIT-NEXT:    std 5, 208(1)
-; 64BIT-NEXT:    stfs 2, 232(1)
 ; 64BIT-NEXT:    std 6, 224(1)
-; 64BIT-NEXT:    std 8, 248(1)
+; 64BIT-NEXT:    stfs 2, 232(1)
 ; 64BIT-NEXT:    std 7, 240(1)
+; 64BIT-NEXT:    std 8, 248(1)
 ; 64BIT-NEXT:    stfd 3, 256(1)
 ; 64BIT-NEXT:    std 9, 264(1)
 ; 64BIT-NEXT:    std 10, 248(1)
@@ -757,7 +757,7 @@ entry:
   ret void
 }
 
-define signext i32 @mixed_4(ptr byval(%struct.foo) align 16 %foo, i32 %sec) #0 {
+define signext i32 @mixed_4(ptr byval(%struct.foo) align 16 %foo, i32 %sec) {
 ; 32BIT-LABEL: mixed_4:
 ; 32BIT:       # %bb.0: # %entry
 ; 32BIT-NEXT:    stw 9, 48(1)
@@ -807,10 +807,70 @@ entry:
   ret i32 %add1
 }
 
-declare void @foo() #0
-declare void @consume_f64(double) #0
-declare void @consume_f32(float) #0
-declare void @consume_i64(i64) #0
-declare void @consume_i32(i32 signext) #0
+%struct.bar = type { i8, i32, <4 x i32>, ptr, i8 }
+
+define void @mixed_5(ptr byref(%struct.bar) align 16 %r, ptr byval(%struct.bar) align 16 %x, i32 signext %y, ptr byval(%struct.foo) align 16 %f) {
+; 32BIT-LABEL: mixed_5:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    stw 3, 88(1)
+; 32BIT-NEXT:    lfd 1, 172(1)
+; 32BIT-NEXT:    stw 5, 96(1)
+; 32BIT-NEXT:    stw 6, 100(1)
+; 32BIT-NEXT:    stw 7, 104(1)
+; 32BIT-NEXT:    stw 8, 108(1)
+; 32BIT-NEXT:    stw 9, 112(1)
+; 32BIT-NEXT:    stw 10, 116(1)
+; 32BIT-NEXT:    bl .consume_f64[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    lwz 3, 100(1)
+; 32BIT-NEXT:    bl .consume_i32[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: mixed_5:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    std 3, 160(1)
+; 64BIT-NEXT:    lfd 1, 252(1)
+; 64BIT-NEXT:    std 5, 176(1)
+; 64BIT-NEXT:    std 6, 184(1)
+; 64BIT-NEXT:    std 7, 192(1)
+; 64BIT-NEXT:    std 8, 200(1)
+; 64BIT-NEXT:    std 9, 208(1)
+; 64BIT-NEXT:    std 10, 216(1)
+; 64BIT-NEXT:    bl .consume_f64[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    lwa 3, 180(1)
+; 64BIT-NEXT:    bl .consume_i32[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
+entry:
+  %d = getelementptr inbounds i8, ptr %f, i64 12
+  %0 = load double, ptr %d, align 4
+  tail call void @consume_f64(double %0)
+  %i = getelementptr inbounds i8, ptr %x, i64 4
+  %1 = load i32, ptr %i, align 4
+  tail call void @consume_i32(i32 signext %1)
+  ret void
+}
+
+declare void @foo()
+declare void @consume_f64(double)
+declare void @consume_f32(float)
+declare void @consume_i64(i64)
+declare void @consume_i32(i32 signext)
+
+!llvm.module.flags = !{!0}
 
-attributes #0 = { "save-reg-params" nofree noinline nounwind }
+!0 = !{i32 1, !"save-reg-params", i32 1}



More information about the cfe-commits mailing list