[llvm] b373ec8 - [AIX] Implement caller arguments passed in stack memory.

Thu Feb 6 09:08:17 PST 2020

Author: Chris Bowler
Date: 2020-02-06T12:07:34-05:00
New Revision: b373ec8ce76a00cd079f70d108b13129c3a4a7b9

URL: https://github.com/llvm/llvm-project/commit/b373ec8ce76a00cd079f70d108b13129c3a4a7b9
DIFF: https://github.com/llvm/llvm-project/commit/b373ec8ce76a00cd079f70d108b13129c3a4a7b9.diff

LOG: [AIX] Implement caller arguments passed in stack memory.

This patch implements the caller side of placing function call arguments
in stack memory. This removes the current limitation where LLVM on AIX
reports a fatal error when arguments cannot be contained in registers.

There is a particular oddity: a float argument that passes in a
register and also in stack memory requires that the caller initialize
both. From the AIX "ABI" documentation I have, it's not clear that this
needs to be done; however, it is necessary for compatibility with the
AIX XL compiler, so I think it's best to implement it the same way.

Note a later patch will follow to address the callee side.

Differential Revision: https://reviews.llvm.org/D73209

Added: 
    

Modified: 
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/test/CodeGen/PowerPC/aix-cc-abi.ll

Removed: 
    llvm/test/CodeGen/PowerPC/aix-stackargs.ll


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2e086fad5757..c7e9b5fb39fe 100644

--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -6838,10 +6838,10 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
     assert(IsPPC64 && "PPC32 should have split i64 values.");
     LLVM_FALLTHROUGH;
   case MVT::i1:
-  case MVT::i32:
-    State.AllocateStack(PtrByteSize, PtrByteSize);
+  case MVT::i32: {
+    const unsigned Offset = State.AllocateStack(PtrByteSize, PtrByteSize);
+    const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
     if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
-      MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
       // Promote integers if needed.
       if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
         LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
@@ -6849,38 +6849,46 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
     }
     else
-      report_fatal_error("Handling of placing parameters on the stack is "
-                         "unimplemented!");
-    return false;
+      State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
 
+    return false;
+  }
   case MVT::f32:
   case MVT::f64: {
     // Parameter save area (PSA) is reserved even if the float passes in fpr.
     const unsigned StoreSize = LocVT.getStoreSize();
     // Floats are always 4-byte aligned in the PSA on AIX.
     // This includes f64 in 64-bit mode for ABI compatibility.
-    State.AllocateStack(IsPPC64 ? 8 : StoreSize, 4);
-    if (unsigned Reg = State.AllocateReg(FPR))
-      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-    else
-      report_fatal_error("Handling of placing parameters on the stack is "
-                         "unimplemented!");
+    const unsigned Offset = State.AllocateStack(IsPPC64 ? 8 : StoreSize, 4);
+    unsigned FReg = State.AllocateReg(FPR);
+    if (FReg)
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
 
-    // AIX requires that GPRs are reserved for float arguments.
-    // Successfully reserved GPRs are only initialized for vararg calls.
-    MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
+    // Reserve and initialize GPRs or initialize the PSA as required.
+    const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
     for (unsigned I = 0; I < StoreSize; I += PtrByteSize) {
       if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
+        assert(FReg && "An FPR should be available when a GPR is reserved.");
         if (State.isVarArg()) {
+          // Successfully reserved GPRs are only initialized for vararg calls.
           // Custom handling is required for:
           //   f64 in PPC32 needs to be split into 2 GPRs.
           //   f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.
           State.addLoc(
               CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
         }
-      } else if (State.isVarArg()) {
-        report_fatal_error("Handling of placing parameters on the stack is "
-                           "unimplemented!");
+      } else {
+        // If there are insufficient GPRs, the PSA needs to be initialized.
+        // Initialization occurs even if an FPR was initialized for
+        // compatibility with the AIX XL compiler. The full memory for the
+        // argument will be initialized even if a prior word is saved in GPR.
+        // A custom memLoc is used when the argument also passes in FPR so
+        // that the callee handling can skip over it easily.
+        State.addLoc(
+            FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
+                                             LocInfo)
+                 : CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+        break;
       }
     }
 
@@ -6963,27 +6971,36 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   CCInfo.AllocateStack(LinkageSize + MinParameterSaveArea, PtrByteSize);
   CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
 
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-    SDValue ArgValue;
-    ISD::ArgFlagsTy Flags = Ins[i].Flags;
-    if (VA.isRegLoc()) {
-      EVT ValVT = VA.getValVT();
-      MVT LocVT = VA.getLocVT();
-      MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
-      unsigned VReg =
-          MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
-      ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
-      if (ValVT.isScalarInteger() &&
-          (ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
-        ArgValue =
-            truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
-      }
-      InVals.push_back(ArgValue);
-    } else {
-      report_fatal_error("Handling of formal arguments on the stack is "
-                         "unimplemented!");
+  for (CCValAssign &VA : ArgLocs) {
+
+    if (VA.isMemLoc()) {
+      // For compatibility with the AIX XL compiler, the float args in the
+      // parameter save area are initialized even if the argument is available
+      // in register.  The caller is required to initialize both the register
+      // and memory, however, the callee can choose to expect it in either.  The
+      // memloc is dismissed here because the argument is retrieved from the
+      // register.
+      if (VA.needsCustom())
+        continue;
+      report_fatal_error(
+          "Handling of formal arguments on the stack is unimplemented!");
+    }
+
+    assert(VA.isRegLoc() && "Unexpected argument location.");
+
+    EVT ValVT = VA.getValVT();
+    MVT LocVT = VA.getLocVT();
+    MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
+    unsigned VReg =
+        MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
+    SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+    if (ValVT.isScalarInteger() &&
+        (ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
+      ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
+      ArgValue =
+          truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
     }
+    InVals.push_back(ArgValue);
   }
 
   // Area that is at least reserved in the caller of this function.
@@ -7035,6 +7052,7 @@ SDValue PPCTargetLowering::LowerCall_AIX(
   // The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
   const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
   const bool IsPPC64 = Subtarget.isPPC64();
+  const EVT PtrVT = getPointerTy(DAG.getDataLayout());
   const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
   CCInfo.AllocateStack(LinkageSize, PtrByteSize);
   CCInfo.AnalyzeCallOperands(Outs, CC_AIX);
@@ -7046,7 +7064,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
   // conservatively assume that it is needed.  As such, make sure we have at
   // least enough stack space for the caller to store the 8 GPRs.
   const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
-  const unsigned NumBytes = LinkageSize + MinParameterSaveAreaSize;
+  const unsigned NumBytes = std::max(LinkageSize + MinParameterSaveAreaSize,
+                                     CCInfo.getNextStackOffset());
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass.
@@ -7054,20 +7073,23 @@ SDValue PPCTargetLowering::LowerCall_AIX(
   SDValue CallSeqStart = Chain;
 
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Set up a copy of the stack pointer for loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
+                                   : DAG.getRegister(PPC::R1, MVT::i32);
 
   for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
     CCValAssign &VA = ArgLocs[I++];
 
-    if (VA.isMemLoc())
-      report_fatal_error("Handling of placing parameters on the stack is "
-                         "unimplemented!");
-    if (!VA.isRegLoc())
-      report_fatal_error(
-          "Unexpected non-register location for function call argument.");
-
     SDValue Arg = OutVals[VA.getValNo()];
 
-    if (!VA.needsCustom()) {
+    if (!VA.isRegLoc() && !VA.isMemLoc())
+      report_fatal_error("Unexpected location for function call argument.");
+
+    if (VA.isRegLoc() && !VA.needsCustom()) {
       switch (VA.getLocInfo()) {
       default:
         report_fatal_error("Unexpected argument extension type.");
@@ -7085,11 +7107,21 @@ SDValue PPCTargetLowering::LowerCall_AIX(
       continue;
     }
 
+    if (VA.isMemLoc()) {
+      SDValue PtrOff =
+          DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
+      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+      MemOpChains.push_back(
+          DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+
+      continue;
+    }
+
     // Custom handling is used for GPR initializations for vararg float
     // arguments.
-    assert(CFlags.IsVarArg && VA.getValVT().isFloatingPoint() &&
-           VA.getLocVT().isInteger() &&
-           "Unexpected custom register handling for calling convention.");
+    assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
+           VA.getValVT().isFloatingPoint() && VA.getLocVT().isInteger() &&
+           "Unexpected register handling for calling convention.");
 
     SDValue ArgAsInt =
         DAG.getBitcast(MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Arg);
@@ -7112,15 +7144,24 @@ SDValue PPCTargetLowering::LowerCall_AIX(
                                      DAG.getConstant(32, dl, MVT::i8));
       RegsToPass.push_back(std::make_pair(
           GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));
-      assert(I != E && "A second custom GPR is expected!");
-      CCValAssign &GPR2 = ArgLocs[I++];
-      assert(GPR2.isRegLoc() && GPR2.getValNo() == GPR1.getValNo() &&
-             GPR2.needsCustom() && "A second custom GPR is expected!");
-      RegsToPass.push_back(std::make_pair(
-          GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
+
+      if (I != E) {
+        // If only 1 GPR was available, there will only be one custom GPR and
+        // the argument will also pass in memory.
+        CCValAssign &PeekArg = ArgLocs[I];
+        if (PeekArg.isRegLoc() && PeekArg.getValNo() == PeekArg.getValNo()) {
+          assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
+          CCValAssign &GPR2 = ArgLocs[I++];
+          RegsToPass.push_back(std::make_pair(
+              GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
+        }
+      }
     }
   }
 
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
   // For indirect calls, we need to save the TOC base to the stack for
   // restoration after the call.
   if (CFlags.IsIndirect) {

diff  --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
index 7ef16d77a264..83fd8359097a 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
@@ -447,41 +447,109 @@ entry:
 
 ; CHECK-LABEL: name: call_test_fpr_max{{.*}}
 
-; 32BIT:      renamable $r3 = LWZtoc @d1, $r2 :: (load 4 from got)
-; 32BIT-NEXT: renamable $f1 = LFD 0, killed renamable $r3 :: (dereferenceable load 8 from @d1)
-; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT: $f2 = COPY renamable $f1
-; 32BIT-NEXT: $f3 = COPY renamable $f1
-; 32BIT-NEXT: $f4 = COPY renamable $f1
-; 32BIT-NEXT: $f5 = COPY renamable $f1
-; 32BIT-NEXT: $f6 = COPY renamable $f1
-; 32BIT-NEXT: $f7 = COPY renamable $f1
-; 32BIT-NEXT: $f8 = COPY renamable $f1
-; 32BIT-NEXT: $f9 = COPY renamable $f1
-; 32BIT-NEXT: $f10 = COPY renamable $f1
-; 32BIT-NEXT: $f11 = COPY renamable $f1
-; 32BIT-NEXT: $f12 = COPY renamable $f1
-; 32BIT-NEXT: $f13 = COPY renamable $f1
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_fpr_max>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $r2, implicit-def $r1
-; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT:      renamable $x3 = LDtoc @d1, $x2 :: (load 8 from got)
-; 64BIT-NEXT: renamable $f1 = LFD 0, killed renamable $x3 :: (dereferenceable load 8 from @d1)
-; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT: $f2 = COPY renamable $f1
-; 64BIT-NEXT: $f3 = COPY renamable $f1
-; 64BIT-NEXT: $f4 = COPY renamable $f1
-; 64BIT-NEXT: $f5 = COPY renamable $f1
-; 64BIT-NEXT: $f6 = COPY renamable $f1
-; 64BIT-NEXT: $f7 = COPY renamable $f1
-; 64BIT-NEXT: $f8 = COPY renamable $f1
-; 64BIT-NEXT: $f9 = COPY renamable $f1
-; 64BIT-NEXT: $f10 = COPY renamable $f1
-; 64BIT-NEXT: $f11 = COPY renamable $f1
-; 64BIT-NEXT: $f12 = COPY renamable $f1
-; 64BIT-NEXT: $f13 = COPY renamable $f1
+; 32BIT:      renamable $r[[REG:[0-9]+]] = LWZtoc @d1, $r2 :: (load 4 from got)
+; 32BIT-NEXT: renamable $f1 = LFD 0, killed renamable $r[[REG]] :: (dereferenceable load 8 from @d1)
+; 32BIT-NEXT: ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
+; 32BIT-DAG:  STFD renamable $f1, 56, $r1 :: (store 8)
+; 32BIT-DAG:  STFD renamable $f1, 64, $r1 :: (store 8)
+; 32BIT-DAG:  STFD renamable $f1, 72, $r1 :: (store 8)
+; 32BIT-DAG:  STFD renamable $f1, 80, $r1 :: (store 8)
+; 32BIT-DAG:  STFD renamable $f1, 88, $r1 :: (store 8)
+; 32BIT-DAG:  STFD renamable $f1, 96, $r1 :: (store 8)
+; 32BIT-DAG:  STFD renamable $f1, 104, $r1 :: (store 8)
+; 32BIT-DAG:  STFD renamable $f1, 112, $r1 :: (store 8)
+; 32BIT-DAG:  STFD renamable $f1, 120, $r1 :: (store 8)
+; 32BIT-DAG:  $f2 = COPY renamable $f1
+; 32BIT-DAG:  $f3 = COPY renamable $f1
+; 32BIT-DAG:  $f4 = COPY renamable $f1
+; 32BIT-DAG:  $f5 = COPY renamable $f1
+; 32BIT-DAG:  $f6 = COPY renamable $f1
+; 32BIT-DAG:  $f7 = COPY renamable $f1
+; 32BIT-DAG:  $f8 = COPY renamable $f1
+; 32BIT-DAG:  $f9 = COPY renamable $f1
+; 32BIT-DAG:  $f10 = COPY renamable $f1
+; 32BIT-DAG:  $f11 = COPY renamable $f1
+; 32BIT-DAG:  $f12 = COPY renamable $f1
+; 32BIT-DAG:  $f13 = COPY renamable $f1
+; 32BIT-NEXT: BL_NOP <mcsymbol .test_fpr_max>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1
+; 32BIT-NEXT: ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
+
+; CHECKASM-LABEL: .call_test_fpr_max:
+
+; ASM32PWR4:       stwu 1, -128(1)
+; ASM32PWR4-NEXT:  lwz [[REG:[0-9]+]], LC2(2)
+; ASM32PWR4-NEXT:  lfd 1, 0([[REG]])
+; ASM32PWR4-DAG:   stfd 1, 56(1)
+; ASM32PWR4-DAG:   stfd 1, 64(1)
+; ASM32PWR4-DAG:   stfd 1, 72(1)
+; ASM32PWR4-DAG:   stfd 1, 80(1)
+; ASM32PWR4-DAG:   stfd 1, 88(1)
+; ASM32PWR4-DAG:   stfd 1, 96(1)
+; ASM32PWR4-DAG:   stfd 1, 104(1)
+; ASM32PWR4-DAG:   stfd 1, 112(1)
+; ASM32PWR4-DAG:   stfd 1, 120(1)
+; ASM32PWR4-DAG:   fmr 2, 1
+; ASM32PWR4-DAG:   fmr 3, 1
+; ASM32PWR4-DAG:   fmr 4, 1
+; ASM32PWR4-DAG:   fmr 5, 1
+; ASM32PWR4-DAG:   fmr 6, 1
+; ASM32PWR4-DAG:   fmr 7, 1
+; ASM32PWR4-DAG:   fmr 8, 1
+; ASM32PWR4-DAG:   fmr 9, 1
+; ASM32PWR4-DAG:   fmr 10, 1
+; ASM32PWR4-DAG:   fmr 11, 1
+; ASM32PWR4-DAG:   fmr 12, 1
+; ASM32PWR4-DAG:   fmr 13, 1
+; ASM32PWR4-NEXT:  bl .test_fpr_max
+; ASM32PWR4-NEXT:  nop
+; ASM32PWR4-NEXT:  addi 1, 1, 128
+
+; 64BIT:      renamable $x[[REGD1ADDR:[0-9]+]] = LDtoc @d1, $x2 :: (load 8 from got)
+; 64BIT-NEXT: renamable $f1 = LFD 0, killed renamable $x[[REGD1ADDR:[0-9]+]] :: (dereferenceable load 8 from @d1)
+; 64BIT-NEXT: ADJCALLSTACKDOWN 152, 0, implicit-def dead $r1, implicit $r1
+; 64BIT-DAG:  STFD renamable $f1, 112, $x1 :: (store 8)
+; 64BIT-DAG:  STFD renamable $f1, 120, $x1 :: (store 8)
+; 64BIT-DAG:  STFD renamable $f1, 128, $x1 :: (store 8)
+; 64BIT-DAG:  STFD renamable $f1, 136, $x1 :: (store 8)
+; 64BIT-DAG:  STFD renamable $f1, 144, $x1 :: (store 8)
+; 64BIT-DAG:  $f2 = COPY renamable $f1
+; 64BIT-DAG:  $f3 = COPY renamable $f1
+; 64BIT-DAG:  $f4 = COPY renamable $f1
+; 64BIT-DAG:  $f5 = COPY renamable $f1
+; 64BIT-DAG:  $f6 = COPY renamable $f1
+; 64BIT-DAG:  $f7 = COPY renamable $f1
+; 64BIT-DAG:  $f8 = COPY renamable $f1
+; 64BIT-DAG:  $f9 = COPY renamable $f1
+; 64BIT-DAG:  $f10 = COPY renamable $f1
+; 64BIT-DAG:  $f11 = COPY renamable $f1
+; 64BIT-DAG:  $f12 = COPY renamable $f1
+; 64BIT-DAG:  $f13 = COPY renamable $f1
 ; 64BIT-NEXT: BL8_NOP <mcsymbol .test_fpr_max>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $x2, implicit-def $r1
-; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+; 64BIT-NEXT: ADJCALLSTACKUP 152, 0, implicit-def dead $r1, implicit $r1
+
+; ASM64PWR4:       stdu 1, -160(1)
+; ASM64PWR4-NEXT:  ld [[REG:[0-9]+]], LC2(2)
+; ASM64PWR4-NEXT:  lfd 1, 0([[REG]])
+; ASM64PWR4-DAG:   stfd 1, 112(1)
+; ASM64PWR4-DAG:   stfd 1, 120(1)
+; ASM64PWR4-DAG:   stfd 1, 128(1)
+; ASM64PWR4-DAG:   stfd 1, 136(1)
+; ASM64PWR4-DAG:   stfd 1, 144(1)
+; ASM64PWR4-DAG:   fmr 2, 1
+; ASM64PWR4-DAG:   fmr 3, 1
+; ASM64PWR4-DAG:   fmr 4, 1
+; ASM64PWR4-DAG:   fmr 5, 1
+; ASM64PWR4-DAG:   fmr 6, 1
+; ASM64PWR4-DAG:   fmr 7, 1
+; ASM64PWR4-DAG:   fmr 8, 1
+; ASM64PWR4-DAG:   fmr 9, 1
+; ASM64PWR4-DAG:   fmr 10, 1
+; ASM64PWR4-DAG:   fmr 11, 1
+; ASM64PWR4-DAG:   fmr 12, 1
+; ASM64PWR4-DAG:   fmr 13, 1
+; ASM64PWR4-NEXT:  bl .test_fpr_max
+; ASM64PWR4-NEXT:  nop
+; ASM64PWR4-NEXT:  addi 1, 1, 160
 
 define double @test_fpr_max(double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13) {
 entry:
@@ -829,8 +897,8 @@ entry:
 ; ASM64PWR4-NEXT: lfd 2, 0([[REG]])
 ; ASM64PWR4-NEXT: li 3, 42
 ; ASM64PWR4-NEXT: stfd 2, 120(1)
-; ASM64PWR4-DAG: ld 4, 112(1)
-; ASM64PWR4-DAG: ld 6, 120(1)
+; ASM64PWR4-DAG:  ld 4, 112(1)
+; ASM64PWR4-DAG:  ld 6, 120(1)
 ; ASM64PWR4-NEXT: bl .test_vararg
 ; ASM64PWR4-NEXT: nop
 
@@ -878,3 +946,323 @@ entry:
 ; ASM64PWR4-NEXT: lwz 4, 124(1)
 ; ASM64PWR4-NEXT: bl .test_vararg
 ; ASM64PWR4-NEXT: nop
+
+@c = common global i8 0, align 1
+@si = common global i16 0, align 2
+@i = common global i32 0, align 4
+@lli = common global i64 0, align 8
+@f = common global float 0.000000e+00, align 4
+@d = common global double 0.000000e+00, align 8
+
+; Basic saving of integral type arguments to the parameter save area.
+define void @call_test_stackarg_int() {
+entry:
+  %0 = load i8, i8* @c, align 1
+  %1 = load i16, i16* @si, align 2
+  %2 = load i32, i32* @i, align 4
+  %3 = load i64, i64* @lli, align 8
+  %4 = load i32, i32* @i, align 4
+  call void @test_stackarg_int(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i8 zeroext %0, i16 signext %1, i32 %2, i64 %3, i32 %4)
+  ret void
+}
+
+declare void @test_stackarg_int(i32, i32, i32, i32, i32, i32, i32, i32, i8 zeroext, i16 signext, i32, i64, i32)
+
+; CHECK-LABEL:     name: call_test_stackarg_int{{.*}}
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; 32BIT-DAG:  renamable $r[[REGCADDR:[0-9]+]] = LWZtoc @c, $r2 :: (load 4 from got)
+; 32BIT-DAG:  renamable $r[[REGC:[0-9]+]] = LBZ 0, killed renamable $r[[REGCADDR]] :: (dereferenceable load 1 from @c)
+; 32BIT-DAG:  renamable $r[[REGSIADDR:[0-9]+]] = LWZtoc @si, $r2 :: (load 4 from got)
+; 32BIT-DAG:  renamable $r[[REGSI:[0-9]+]] = LHA 0, killed renamable $r[[REGSIADDR]] :: (dereferenceable load 2 from @si)
+; 32BIT-DAG:  renamable $r[[REGIADDR:[0-9]+]] = LWZtoc @i, $r2 :: (load 4 from got)
+; 32BIT-DAG:  renamable $r[[REGI:[0-9]+]] = LWZ 0, killed renamable $r[[REGIADDR]] :: (dereferenceable load 4 from @i)
+; 32BIT-DAG:  renamable $r[[REGLLIADDR:[0-9]+]] = LWZtoc @lli, $r2 :: (load 4 from got)
+; 32BIT-DAG:  renamable $r[[REGLLI1:[0-9]+]] = LWZ 0, renamable $r[[REGLLIADDR]] :: (dereferenceable load 4 from @lli, align 8)
+; 32BIT-DAG:  renamable $r[[REGLLI2:[0-9]+]] = LWZ 4, killed renamable $r[[REGLLIADDR]] :: (dereferenceable load 4 from @lli + 4)
+; 32BIT-NEXT: ADJCALLSTACKDOWN 80, 0, implicit-def dead $r1, implicit $r1
+; 32BIT-DAG:  STW killed renamable $r[[REGC]], 56, $r1 :: (store 4)
+; 32BIT-DAG:  STW killed renamable $r[[REGSI]], 60, $r1 :: (store 4)
+; 32BIT-DAG:  STW killed renamable $r[[REGI]], 64, $r1 :: (store 4)
+; 32BIT-DAG:  STW killed renamable $r[[REGLLI1]], 68, $r1 :: (store 4)
+; 32BIT-DAG:  STW killed renamable $r[[REGLLI2]], 72, $r1 :: (store 4)
+; 32BIT-DAG:  STW renamable $r[[REGI]], 76, $r1 :: (store 4)
+; 32BIT-DAG:  $r3 = LI 1
+; 32BIT-DAG:  $r4 = LI 2
+; 32BIT-DAG:  $r5 = LI 3
+; 32BIT-DAG:  $r6 = LI 4
+; 32BIT-DAG:  $r7 = LI 5
+; 32BIT-DAG:  $r8 = LI 6
+; 32BIT-DAG:  $r9 = LI 7
+; 32BIT-DAG:  $r10 = LI 8
+; 32BIT-NEXT: BL_NOP <mcsymbol .test_stackarg_int>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
+; 32BIT-NEXT: ADJCALLSTACKUP 80, 0, implicit-def dead $r1, implicit $r1
+
+; Basic saving of floating point type arguments to the parameter save area.
+; The float and double arguments will pass in both fpr as well as parameter save area.
+define void @call_test_stackarg_float() {
+entry:
+  %0 = load float, float* @f, align 4
+  %1 = load double, double* @d, align 8
+  call void @test_stackarg_float(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, float %0, double %1)
+  ret void
+}
+
+declare void @test_stackarg_float(i32, i32, i32, i32, i32, i32, i32, i32, float, double)
+
+; CHECK-LABEL:     name:            call_test_stackarg_float
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; 32BIT-DAG:   renamable $r[[REGF:[0-9]+]] = LWZtoc @f, $r2 :: (load 4 from got)
+; 32BIT-DAG:   renamable $f1 = LFS 0, killed renamable $r[[REGF]] :: (dereferenceable load 4 from @f)
+; 32BIT-DAG:   renamable $r[[REGD:[0-9]+]] = LWZtoc @d, $r2 :: (load 4 from got)
+; 32BIT-DAG:   renamable $f2 = LFD 0, killed renamable $r[[REGD]] :: (dereferenceable load 8 from @d)
+; 32BIT-NEXT:  ADJCALLSTACKDOWN 68, 0, implicit-def dead $r1, implicit $r1
+; 32BIT-DAG:   STFS renamable $f1, 56, $r1 :: (store 4)
+; 32BIT-DAG:   STFD renamable $f2, 60, $r1 :: (store 8)
+; 32BIT-DAG:   $r3 = LI 1
+; 32BIT-DAG:   $r4 = LI 2
+; 32BIT-DAG:   $r5 = LI 3
+; 32BIT-DAG:   $r6 = LI 4
+; 32BIT-DAG:   $r7 = LI 5
+; 32BIT-DAG:   $r8 = LI 6
+; 32BIT-DAG:   $r9 = LI 7
+; 32BIT-DAG:   $r10 = LI 8
+; 32BIT-NEXT:  BL_NOP <mcsymbol .test_stackarg_float>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $f1, implicit $f2, implicit $r2, implicit-def $r1
+; 32BIT-NEXT:  ADJCALLSTACKUP 68, 0, implicit-def dead $r1, implicit $r1
+
+; CHECKASM-LABEL: .call_test_stackarg_float:
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; ASM32PWR4:      stwu 1, -80(1)
+; ASM32PWR4-DAG:  lwz [[REGF:[0-9]+]], LC8(2)
+; ASM32PWR4-DAG:  lfs 1, 0([[REGF]])
+; ASM32PWR4-DAG:  lwz [[REGD:[0-9]+]], LC9(2)
+; ASM32PWR4-DAG:  lfd 2, 0([[REGD:[0-9]+]])
+; ASM32PWR4-DAG:  stfs 1, 56(1)
+; ASM32PWR4-DAG:  stfd 2, 60(1)
+; ASM32PWR4-DAG:  li 3, 1
+; ASM32PWR4-DAG:  li 4, 2
+; ASM32PWR4-DAG:  li 5, 3
+; ASM32PWR4-DAG:  li 6, 4
+; ASM32PWR4-DAG:  li 7, 5
+; ASM32PWR4-DAG:  li 8, 6
+; ASM32PWR4-DAG:  li 9, 7
+; ASM32PWR4-DAG:  li 10, 8
+; ASM32PWR4-NEXT: bl .test_stackarg_float
+; ASM32PWR4-NEXT: nop
+; ASM32PWR4-NEXT: addi 1, 1, 80
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; 64BIT-DAG:   renamable $x[[REGF:[0-9]+]] = LDtoc @f, $x2 :: (load 8 from got)
+; 64BIT-DAG:   renamable $f1 = LFS 0, killed renamable $x[[REGF]] :: (dereferenceable load 4 from @f)
+; 64BIT-DAG:   renamable $x[[REGD:[0-9]+]] = LDtoc @d, $x2 :: (load 8 from got)
+; 64BIT-DAG:   renamable $f2 = LFD 0, killed renamable $x[[REGD]] :: (dereferenceable load 8 from @d)
+; 64BIT-NEXT:  ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
+; 64BIT-DAG:   STFS renamable $f1, 112, $x1 :: (store 4)
+; 64BIT-DAG:   STFD renamable $f2, 120, $x1 :: (store 8)
+; 64BIT-DAG:   $x3 = LI8 1
+; 64BIT-DAG:   $x4 = LI8 2
+; 64BIT-DAG:   $x5 = LI8 3
+; 64BIT-DAG:   $x6 = LI8 4
+; 64BIT-DAG:   $x7 = LI8 5
+; 64BIT-DAG:   $x8 = LI8 6
+; 64BIT-DAG:   $x9 = LI8 7
+; 64BIT-DAG:   $x10 = LI8 8
+; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_stackarg_float>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $f1, implicit $f2, implicit $x2, implicit-def $r1
+; 64BIT-NEXT:  ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; ASM64PWR4:      stdu 1, -128(1)
+; ASM64PWR4-DAG:  ld [[REGF:[0-9]+]], LC7(2)
+; ASM64PWR4-DAG:  lfs 1, 0([[REGF]])
+; ASM64PWR4-DAG:  ld [[REGD:[0-9]+]], LC8(2)
+; ASM64PWR4-DAG:  lfd 2, 0([[REGD]])
+; ASM64PWR4-DAG:  stfs 1, 112(1)
+; ASM64PWR4-DAG:  stfd 2, 120(1)
+; ASM64PWR4-DAG:  li 3, 1
+; ASM64PWR4-DAG:  li 4, 2
+; ASM64PWR4-DAG:  li 5, 3
+; ASM64PWR4-DAG:  li 6, 4
+; ASM64PWR4-DAG:  li 7, 5
+; ASM64PWR4-DAG:  li 8, 6
+; ASM64PWR4-DAG:  li 9, 7
+; ASM64PWR4-DAG:  li 10, 8
+; ASM64PWR4-NEXT: bl .test_stackarg_float
+; ASM64PWR4-NEXT: nop
+; ASM64PWR4-NEXT: addi 1, 1, 128
+
+define void @call_test_stackarg_float2() {
+entry:
+  %0 = load double, double* @d, align 8
+  call void (i32, i32, i32, i32, i32, i32, ...) @test_stackarg_float2(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, double %0)
+  ret void
+}
+
+declare void @test_stackarg_float2(i32, i32, i32, i32, i32, i32, ...)
+
+; CHECK-LABEL:     name: call_test_stackarg_float2{{.*}}
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; 32BIT-DAG:   renamable $r[[REG:[0-9]+]] = LWZtoc @d, $r2 :: (load 4 from got)
+; 32BIT-DAG:   renamable $f1 = LFD 0, killed renamable $r[[REG]] :: (dereferenceable load 8 from @d)
+; 32BIT-DAG:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+; 32BIT-DAG:   $r3 = LI 1
+; 32BIT-DAG:   $r4 = LI 2
+; 32BIT-DAG:   $r5 = LI 3
+; 32BIT-DAG:   $r6 = LI 4
+; 32BIT-DAG:   $r7 = LI 5
+; 32BIT-DAG:   $r8 = LI 6
+; 32BIT-DAG:   STFD renamable $f1, 0, %stack.0 :: (store 8 into %stack.0)
+; 32BIT-DAG:   renamable $r9 = LWZ 0, %stack.0 :: (load 4 from %stack.0, align 8)
+; 32BIT-DAG:   renamable $r10 = LWZ 4, %stack.0 :: (load 4 from %stack.0 + 4)
+; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float2>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit $f1, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
+; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+
+; CHECKASM-LABEL: .call_test_stackarg_float2:
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; ASM32PWR4:     stwu 1, -64(1)
+; ASM32PWR4-DAG: li 3, 1
+; ASM32PWR4-DAG: li 4, 2
+; ASM32PWR4-DAG: li 5, 3
+; ASM32PWR4-DAG: li 6, 4
+; ASM32PWR4-DAG: li 7, 5
+; ASM32PWR4-DAG: li 8, 6
+; ASM32PWR4-DAG: lwz [[REG:[0-9]+]], LC9(2)
+; ASM32PWR4-DAG: lfd 1, 0([[REG]])
+; ASM32PWR4-DAG: stfd 1, 56(1)
+; ASM32PWR4-DAG: lwz 9, 56(1)
+; ASM32PWR4-DAG: lwz 10, 60(1)
+; ASM32PWR4-NEXT: bl .test_stackarg_float2
+; ASM32PWR4-NEXT: nop
+; ASM32PWR4-NEXT: addi 1, 1, 64
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; 64BIT-DAG:   renamable $x[[REG:[0-9]+]] = LDtoc @d, $x2 :: (load 8 from got)
+; 64BIT-DAG:   renamable $f1 = LFD 0, killed renamable $x[[REG]] :: (dereferenceable load 8 from @d)
+; 64BIT-DAG:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+; 64BIT-DAG:   $x3 = LI8 1
+; 64BIT-DAG:   $x4 = LI8 2
+; 64BIT-DAG:   $x5 = LI8 3
+; 64BIT-DAG:   $x6 = LI8 4
+; 64BIT-DAG:   $x7 = LI8 5
+; 64BIT-DAG:   $x8 = LI8 6
+; 64BIT-DAG:   STFD renamable $f1, 0, %stack.0 :: (store 8 into %stack.0)
+; 64BIT-DAG:   renamable $x9 = LD 0, %stack.0 :: (load 8 from %stack.0)
+; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_stackarg_float2>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit $f1, implicit $x9, implicit $x2, implicit-def $r1
+; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; ASM64PWR4:     stdu 1, -128(1)
+; ASM64PWR4-DAG: li 3, 1
+; ASM64PWR4-DAG: li 4, 2
+; ASM64PWR4-DAG: li 5, 3
+; ASM64PWR4-DAG: li 6, 4
+; ASM64PWR4-DAG: li 7, 5
+; ASM64PWR4-DAG: li 8, 6
+; ASM64PWR4-DAG: ld [[REG:[0-9]+]], LC8(2)
+; ASM64PWR4-DAG: lfd 1, 0([[REG]])
+; ASM64PWR4-DAG: stfd 1, 120(1)
+; ASM64PWR4-DAG: ld 9, 120(1)
+; ASM64PWR4-NEXT: bl .test_stackarg_float2
+; ASM64PWR4-NEXT: nop
+; ASM64PWR4-NEXT: addi 1, 1, 128
+
+; A double arg will pass on the stack in PPC32 if there is only one available GPR.
+define void @call_test_stackarg_float3() {
+entry:
+  %0 = load double, double* @d, align 8
+  %1 = load float, float* @f, align 4
+  call void (i32, i32, i32, i32, i32, i32, i32, ...) @test_stackarg_float3(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, double %0, float %1)
+  ret void
+}
+
+declare void @test_stackarg_float3(i32, i32, i32, i32, i32, i32, i32, ...)
+
+; CHECK-LABEL:     name: call_test_stackarg_float3{{.*}}
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; In 32-bit the double arg is written to memory because it cannot be fully stored in the last 32-bit GPR.
+; 32BIT-DAG:   renamable $r[[REGD:[0-9]+]] = LWZtoc @d, $r2 :: (load 4 from got)
+; 32BIT-DAG:   renamable $f1 = LFD 0, killed renamable $r[[REGD]] :: (dereferenceable load 8 from @d)
+; 32BIT-DAG:   renamable $r[[REGF:[0-9]+]] = LWZtoc @f, $r2 :: (load 4 from got)
+; 32BIT-DAG:   renamable $f2 = LFS 0, killed renamable $r[[REGF]] :: (dereferenceable load 4 from @f)
+; 32BIT-DAG:   ADJCALLSTACKDOWN 64, 0, implicit-def dead $r1, implicit $r1
+; 32BIT-DAG:   STFD renamable $f1, 52, $r1 :: (store 8)
+; 32BIT-DAG:   STFS renamable $f2, 60, $r1 :: (store 4)
+; 32BIT-DAG:   $r3 = LI 1
+; 32BIT-DAG:   $r4 = LI 2
+; 32BIT-DAG:   $r5 = LI 3
+; 32BIT-DAG:   $r6 = LI 4
+; 32BIT-DAG:   $r7 = LI 5
+; 32BIT-DAG:   $r8 = LI 6
+; 32BIT-DAG:   $r9 = LI 7
+; 32BIT-DAG:   STFD renamable $f1, 0, %stack.0 :: (store 8 into %stack.0)
+; 32BIT-DAG:   renamable $r10 = LWZ 0, %stack.0 :: (load 4 from %stack.0, align 8)
+; 32BIT-NEXT:  BL_NOP <mcsymbol .test_stackarg_float3>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1
+; 32BIT-NEXT:  ADJCALLSTACKUP 64, 0, implicit-def dead $r1, implicit $r1
+
+; CHECKASM-LABEL: .call_test_stackarg_float3:
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; ASM32PWR4:       stwu 1, -80(1)
+; ASM32PWR4-DAG:   lwz [[REGD:[0-9]+]], LC9(2)
+; ASM32PWR4-DAG:   lfd 1, 0([[REGD]])
+; ASM32PWR4-DAG:   lwz [[REGF:[0-9]+]], LC8(2)
+; ASM32PWR4-DAG:   lfs 2, 0([[REGF]])
+; ASM32PWR4-DAG:   stfd 1, 52(1)
+; ASM32PWR4-DAG:   stfs 2, 60(1)
+; ASM32PWR4-DAG:   li 3, 1
+; ASM32PWR4-DAG:   li 4, 2
+; ASM32PWR4-DAG:   li 5, 3
+; ASM32PWR4-DAG:   li 6, 4
+; ASM32PWR4-DAG:   li 7, 5
+; ASM32PWR4-DAG:   li 8, 6
+; ASM32PWR4-DAG:   li 9, 7
+; ASM32PWR4-DAG:   stfd 1, 72(1)
+; ASM32PWR4-DAG:   lwz 10, 72(1)
+; ASM32PWR4-NEXT:  bl .test_stackarg_float3
+; ASM32PWR4-NEXT:  nop
+; ASM32PWR4-NEXT:  addi 1, 1, 80
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; In 64-bit the double arg is not written to memory because it is fully stored in the last 64-bit GPR.
+; 64BIT-DAG:   renamable $x[[REGD:[0-9]+]] = LDtoc @d, $x2 :: (load 8 from got)
+; 64BIT-DAG:   renamable $f1 = LFD 0, killed renamable $x[[REGD]] :: (dereferenceable load 8 from @d)
+; 64BIT-DAG:   renamable $x[[REGF:[0-9]+]] = LDtoc @f, $x2 :: (load 8 from got)
+; 64BIT-DAG:   renamable $f2 = LFS 0, killed renamable $x[[REGF]] :: (dereferenceable load 4 from @f)
+; 64BIT-DAG:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
+; 64BIT-DAG:   STFS renamable $f2, 112, $x1 :: (store 4)
+; 64BIT-DAG:   $x3 = LI8 1
+; 64BIT-DAG:   $x4 = LI8 2
+; 64BIT-DAG:   $x5 = LI8 3
+; 64BIT-DAG:   $x6 = LI8 4
+; 64BIT-DAG:   $x7 = LI8 5
+; 64BIT-DAG:   $x8 = LI8 6
+; 64BIT-DAG:   $x9 = LI8 7
+; 64BIT-DAG:   STFD renamable $f1, 0, %stack.0 :: (store 8 into %stack.0)
+; 64BIT-DAG:   renamable $x10 = LD 0, %stack.0 :: (load 8 from %stack.0)
+; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_stackarg_float3>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $f1, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1
+
+; 64BIT-NEXT: ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
+
+; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
+; ASM64PWR4:       stdu 1, -128(1)
+; ASM64PWR4-DAG:   ld [[REGD:[0-9]+]], LC8(2)
+; ASM64PWR4-DAG:   lfd 1, 0([[REGD]])
+; ASM64PWR4-DAG:   ld [[REGF:[0-9]+]], LC7(2)
+; ASM64PWR4-DAG:   lfs 2, 0([[REGF]])
+; ASM64PWR4-DAG:   stfs 2, 112(1)
+; ASM64PWR4-DAG:   li 3, 1
+; ASM64PWR4-DAG:   li 4, 2
+; ASM64PWR4-DAG:   li 5, 3
+; ASM64PWR4-DAG:   li 6, 4
+; ASM64PWR4-DAG:   li 7, 5
+; ASM64PWR4-DAG:   li 8, 6
+; ASM64PWR4-DAG:   li 9, 7
+; ASM64PWR4-DAG:   stfd 1, 120(1)
+; ASM64PWR4-DAG:   ld 10, 120(1)
+; ASM64PWR4-NEXT:  bl .test_stackarg_float3
+; ASM64PWR4-NEXT:  nop
+; ASM64PWR4-NEXT:  addi 1, 1, 128

diff  --git a/llvm/test/CodeGen/PowerPC/aix-stackargs.ll b/llvm/test/CodeGen/PowerPC/aix-stackargs.ll
deleted file mode 100644
index 6cac691e071b..000000000000
--- a/llvm/test/CodeGen/PowerPC/aix-stackargs.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: not llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s
-
-define void @bar() {
-entry:
-  call void @foo(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
-  ret void
-}
-
-declare void @foo(i32, i32, i32, i32, i32, i32, i32, i32, i32)
-
-; CHECK: LLVM ERROR: Handling of placing parameters on the stack is unimplemented!