[llvm] [RISCV] Remove x7 from fastcc list. (PR #96729)

Yeting Kuo via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 15 23:19:30 PDT 2024


https://github.com/yetingk updated https://github.com/llvm/llvm-project/pull/96729

>From 2221e45961579e56db574b02f4e057aa596a51d8 Mon Sep 17 00:00:00 2001
From: Yeting Kuo <yeting.kuo at sifive.com>
Date: Tue, 25 Jun 2024 20:30:50 -0700
Subject: [PATCH 1/2] [RISCV] Remove x7 from fastcc list.

Like #93321, this patch also tries to resolve the conflicting usage of x7
between fastcc and Zicfilp, but this patch removes x7 from the fastcc list
directly. Its purpose is to reduce the code complexity of #93321; we also
found that it increases instruction count by at most 0.02% for most
benchmarks and might be beneficial to benchmarks overall.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |    7 +-
 llvm/test/CodeGen/RISCV/fastcc-int.ll         |   34 +-
 .../CodeGen/RISCV/fastcc-without-f-reg.ll     | 1196 +++++++++--------
 .../CodeGen/RISCV/rvv/calling-conv-fastcc.ll  |   68 +-
 .../rvv/fixed-vectors-calling-conv-fastcc.ll  |   25 +-
 5 files changed, 679 insertions(+), 651 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8b5e56bff4097..2d7fcd0225c51 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18886,13 +18886,12 @@ static ArrayRef<MCPhysReg> getFastCCArgGPRs(const RISCVABI::ABI ABI) {
   // for save-restore libcall, so we don't use them.
   static const MCPhysReg FastCCIGPRs[] = {
       RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
-      RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7,  RISCV::X28,
-      RISCV::X29, RISCV::X30, RISCV::X31};
+      RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X28, RISCV::X29,
+      RISCV::X30, RISCV::X31};
 
   // The GPRs used for passing arguments in the FastCC when using ILP32E/ILP64E.
   static const MCPhysReg FastCCEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12,
-                                          RISCV::X13, RISCV::X14, RISCV::X15,
-                                          RISCV::X7};
+                                          RISCV::X13, RISCV::X14, RISCV::X15};
 
   if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E)
     return ArrayRef(FastCCEGPRs);
diff --git a/llvm/test/CodeGen/RISCV/fastcc-int.ll b/llvm/test/CodeGen/RISCV/fastcc-int.ll
index e4c41a1aa890f..75046b701b235 100644
--- a/llvm/test/CodeGen/RISCV/fastcc-int.ll
+++ b/llvm/test/CodeGen/RISCV/fastcc-int.ll
@@ -32,16 +32,17 @@ define i32 @caller(<16 x i32> %A) nounwind {
 ; RV32-NEXT:    lw a5, 20(a0)
 ; RV32-NEXT:    lw a6, 24(a0)
 ; RV32-NEXT:    lw a7, 28(a0)
-; RV32-NEXT:    lw t2, 32(a0)
-; RV32-NEXT:    lw t3, 36(a0)
-; RV32-NEXT:    lw t4, 40(a0)
-; RV32-NEXT:    lw t5, 44(a0)
-; RV32-NEXT:    lw t6, 48(a0)
-; RV32-NEXT:    lw t1, 52(a0)
+; RV32-NEXT:    lw t3, 32(a0)
+; RV32-NEXT:    lw t4, 36(a0)
+; RV32-NEXT:    lw t5, 40(a0)
+; RV32-NEXT:    lw t6, 44(a0)
+; RV32-NEXT:    lw t1, 48(a0)
+; RV32-NEXT:    lw t2, 52(a0)
 ; RV32-NEXT:    lw s0, 56(a0)
 ; RV32-NEXT:    lw a0, 60(a0)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw s0, 4(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw s0, 8(sp)
+; RV32-NEXT:    sw t2, 4(sp)
 ; RV32-NEXT:    sw t1, 0(sp)
 ; RV32-NEXT:    mv a0, t0
 ; RV32-NEXT:    call callee
@@ -63,16 +64,17 @@ define i32 @caller(<16 x i32> %A) nounwind {
 ; RV64-NEXT:    ld a5, 40(a0)
 ; RV64-NEXT:    ld a6, 48(a0)
 ; RV64-NEXT:    ld a7, 56(a0)
-; RV64-NEXT:    ld t2, 64(a0)
-; RV64-NEXT:    ld t3, 72(a0)
-; RV64-NEXT:    ld t4, 80(a0)
-; RV64-NEXT:    ld t5, 88(a0)
-; RV64-NEXT:    ld t6, 96(a0)
-; RV64-NEXT:    ld t1, 104(a0)
+; RV64-NEXT:    ld t3, 64(a0)
+; RV64-NEXT:    ld t4, 72(a0)
+; RV64-NEXT:    ld t5, 80(a0)
+; RV64-NEXT:    ld t6, 88(a0)
+; RV64-NEXT:    ld t1, 96(a0)
+; RV64-NEXT:    ld t2, 104(a0)
 ; RV64-NEXT:    ld s0, 112(a0)
 ; RV64-NEXT:    ld a0, 120(a0)
-; RV64-NEXT:    sd a0, 16(sp)
-; RV64-NEXT:    sd s0, 8(sp)
+; RV64-NEXT:    sd a0, 24(sp)
+; RV64-NEXT:    sd s0, 16(sp)
+; RV64-NEXT:    sd t2, 8(sp)
 ; RV64-NEXT:    sd t1, 0(sp)
 ; RV64-NEXT:    mv a0, t0
 ; RV64-NEXT:    call callee
diff --git a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
index a44d31dff09cc..1dbb060fc35fa 100644
--- a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
+++ b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
@@ -288,29 +288,30 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX32-NEXT:    lh t2, 196(sp)
 ; ZHINX32-NEXT:    lh t1, 200(sp)
 ; ZHINX32-NEXT:    lh t0, 204(sp)
-; ZHINX32-NEXT:    sh t0, 36(sp)
-; ZHINX32-NEXT:    sh t1, 34(sp)
-; ZHINX32-NEXT:    sh t2, 32(sp)
-; ZHINX32-NEXT:    sh t3, 30(sp)
-; ZHINX32-NEXT:    sh ra, 28(sp)
-; ZHINX32-NEXT:    sh s11, 26(sp)
-; ZHINX32-NEXT:    sh s10, 24(sp)
-; ZHINX32-NEXT:    sh s9, 22(sp)
-; ZHINX32-NEXT:    sh s8, 20(sp)
-; ZHINX32-NEXT:    sh s7, 18(sp)
-; ZHINX32-NEXT:    sh s6, 16(sp)
-; ZHINX32-NEXT:    sh s5, 14(sp)
-; ZHINX32-NEXT:    sh s4, 12(sp)
-; ZHINX32-NEXT:    sh s3, 10(sp)
-; ZHINX32-NEXT:    sh s2, 8(sp)
-; ZHINX32-NEXT:    sh s1, 6(sp)
-; ZHINX32-NEXT:    sh s0, 4(sp)
-; ZHINX32-NEXT:    sh t4, 2(sp)
-; ZHINX32-NEXT:    sh t5, 0(sp)
-; ZHINX32-NEXT:    lw t2, 56(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t3, 52(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t4, 48(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t5, 44(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    sh t0, 38(sp)
+; ZHINX32-NEXT:    sh t1, 36(sp)
+; ZHINX32-NEXT:    sh t2, 34(sp)
+; ZHINX32-NEXT:    sh t3, 32(sp)
+; ZHINX32-NEXT:    sh ra, 30(sp)
+; ZHINX32-NEXT:    sh s11, 28(sp)
+; ZHINX32-NEXT:    sh s10, 26(sp)
+; ZHINX32-NEXT:    sh s9, 24(sp)
+; ZHINX32-NEXT:    sh s8, 22(sp)
+; ZHINX32-NEXT:    sh s7, 20(sp)
+; ZHINX32-NEXT:    sh s6, 18(sp)
+; ZHINX32-NEXT:    sh s5, 16(sp)
+; ZHINX32-NEXT:    sh s4, 14(sp)
+; ZHINX32-NEXT:    sh s3, 12(sp)
+; ZHINX32-NEXT:    sh s2, 10(sp)
+; ZHINX32-NEXT:    sh s1, 8(sp)
+; ZHINX32-NEXT:    sh s0, 6(sp)
+; ZHINX32-NEXT:    sh t4, 4(sp)
+; ZHINX32-NEXT:    sh t5, 2(sp)
+; ZHINX32-NEXT:    sh t6, 0(sp)
+; ZHINX32-NEXT:    lw t3, 56(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw t4, 52(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw t5, 48(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw t6, 44(sp) # 4-byte Folded Reload
 ; ZHINX32-NEXT:    call callee_half_32
 ; ZHINX32-NEXT:    lw ra, 108(sp) # 4-byte Folded Reload
 ; ZHINX32-NEXT:    lw s0, 104(sp) # 4-byte Folded Reload
@@ -372,29 +373,30 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX64-NEXT:    lh t2, 344(sp)
 ; ZHINX64-NEXT:    lh t1, 352(sp)
 ; ZHINX64-NEXT:    lh t0, 360(sp)
-; ZHINX64-NEXT:    sh t0, 36(sp)
-; ZHINX64-NEXT:    sh t1, 34(sp)
-; ZHINX64-NEXT:    sh t2, 32(sp)
-; ZHINX64-NEXT:    sh t3, 30(sp)
-; ZHINX64-NEXT:    sh ra, 28(sp)
-; ZHINX64-NEXT:    sh s11, 26(sp)
-; ZHINX64-NEXT:    sh s10, 24(sp)
-; ZHINX64-NEXT:    sh s9, 22(sp)
-; ZHINX64-NEXT:    sh s8, 20(sp)
-; ZHINX64-NEXT:    sh s7, 18(sp)
-; ZHINX64-NEXT:    sh s6, 16(sp)
-; ZHINX64-NEXT:    sh s5, 14(sp)
-; ZHINX64-NEXT:    sh s4, 12(sp)
-; ZHINX64-NEXT:    sh s3, 10(sp)
-; ZHINX64-NEXT:    sh s2, 8(sp)
-; ZHINX64-NEXT:    sh s1, 6(sp)
-; ZHINX64-NEXT:    sh s0, 4(sp)
-; ZHINX64-NEXT:    sh t4, 2(sp)
-; ZHINX64-NEXT:    sh t5, 0(sp)
-; ZHINX64-NEXT:    ld t2, 64(sp) # 8-byte Folded Reload
-; ZHINX64-NEXT:    ld t3, 56(sp) # 8-byte Folded Reload
-; ZHINX64-NEXT:    ld t4, 48(sp) # 8-byte Folded Reload
-; ZHINX64-NEXT:    ld t5, 40(sp) # 8-byte Folded Reload
+; ZHINX64-NEXT:    sh t0, 38(sp)
+; ZHINX64-NEXT:    sh t1, 36(sp)
+; ZHINX64-NEXT:    sh t2, 34(sp)
+; ZHINX64-NEXT:    sh t3, 32(sp)
+; ZHINX64-NEXT:    sh ra, 30(sp)
+; ZHINX64-NEXT:    sh s11, 28(sp)
+; ZHINX64-NEXT:    sh s10, 26(sp)
+; ZHINX64-NEXT:    sh s9, 24(sp)
+; ZHINX64-NEXT:    sh s8, 22(sp)
+; ZHINX64-NEXT:    sh s7, 20(sp)
+; ZHINX64-NEXT:    sh s6, 18(sp)
+; ZHINX64-NEXT:    sh s5, 16(sp)
+; ZHINX64-NEXT:    sh s4, 14(sp)
+; ZHINX64-NEXT:    sh s3, 12(sp)
+; ZHINX64-NEXT:    sh s2, 10(sp)
+; ZHINX64-NEXT:    sh s1, 8(sp)
+; ZHINX64-NEXT:    sh s0, 6(sp)
+; ZHINX64-NEXT:    sh t4, 4(sp)
+; ZHINX64-NEXT:    sh t5, 2(sp)
+; ZHINX64-NEXT:    sh t6, 0(sp)
+; ZHINX64-NEXT:    ld t3, 64(sp) # 8-byte Folded Reload
+; ZHINX64-NEXT:    ld t4, 56(sp) # 8-byte Folded Reload
+; ZHINX64-NEXT:    ld t5, 48(sp) # 8-byte Folded Reload
+; ZHINX64-NEXT:    ld t6, 40(sp) # 8-byte Folded Reload
 ; ZHINX64-NEXT:    call callee_half_32
 ; ZHINX64-NEXT:    ld ra, 168(sp) # 8-byte Folded Reload
 ; ZHINX64-NEXT:    ld s0, 160(sp) # 8-byte Folded Reload
@@ -414,38 +416,38 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ;
 ; ZFINX32-LABEL: caller_half_32:
 ; ZFINX32:       # %bb.0:
-; ZFINX32-NEXT:    addi sp, sp, -144
-; ZFINX32-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    addi sp, sp, -160
+; ZFINX32-NEXT:    sw ra, 156(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s0, 152(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s1, 148(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s2, 144(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s3, 140(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s4, 136(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s5, 132(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s6, 128(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s7, 124(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s8, 120(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s9, 116(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s10, 112(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s11, 108(sp) # 4-byte Folded Spill
 ; ZFINX32-NEXT:    lw t0, 0(a0)
 ; ZFINX32-NEXT:    lw a1, 4(a0)
-; ZFINX32-NEXT:    sw a1, 88(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw a1, 104(sp) # 4-byte Folded Spill
 ; ZFINX32-NEXT:    lw a1, 8(a0)
-; ZFINX32-NEXT:    sw a1, 84(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw a1, 100(sp) # 4-byte Folded Spill
 ; ZFINX32-NEXT:    lw a1, 12(a0)
-; ZFINX32-NEXT:    sw a1, 80(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw a1, 96(sp) # 4-byte Folded Spill
 ; ZFINX32-NEXT:    lw a1, 16(a0)
-; ZFINX32-NEXT:    sw a1, 76(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw a1, 92(sp) # 4-byte Folded Spill
 ; ZFINX32-NEXT:    lw a5, 20(a0)
 ; ZFINX32-NEXT:    lw a6, 24(a0)
 ; ZFINX32-NEXT:    lw a7, 28(a0)
-; ZFINX32-NEXT:    lw t2, 32(a0)
-; ZFINX32-NEXT:    lw t3, 36(a0)
-; ZFINX32-NEXT:    lw t4, 40(a0)
-; ZFINX32-NEXT:    lw t5, 44(a0)
-; ZFINX32-NEXT:    lw t6, 48(a0)
-; ZFINX32-NEXT:    lw t1, 52(a0)
+; ZFINX32-NEXT:    lw t3, 32(a0)
+; ZFINX32-NEXT:    lw t4, 36(a0)
+; ZFINX32-NEXT:    lw t5, 40(a0)
+; ZFINX32-NEXT:    lw t6, 44(a0)
+; ZFINX32-NEXT:    lw t1, 48(a0)
+; ZFINX32-NEXT:    lw t2, 52(a0)
 ; ZFINX32-NEXT:    lw s0, 56(a0)
 ; ZFINX32-NEXT:    lw s1, 60(a0)
 ; ZFINX32-NEXT:    lw s2, 64(a0)
@@ -464,83 +466,84 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZFINX32-NEXT:    lw a2, 116(a0)
 ; ZFINX32-NEXT:    lw a1, 120(a0)
 ; ZFINX32-NEXT:    lw a0, 124(a0)
-; ZFINX32-NEXT:    sw a0, 72(sp)
-; ZFINX32-NEXT:    sw a1, 68(sp)
-; ZFINX32-NEXT:    sw a2, 64(sp)
-; ZFINX32-NEXT:    sw a3, 60(sp)
-; ZFINX32-NEXT:    sw a4, 56(sp)
-; ZFINX32-NEXT:    sw ra, 52(sp)
-; ZFINX32-NEXT:    sw s11, 48(sp)
-; ZFINX32-NEXT:    sw s10, 44(sp)
-; ZFINX32-NEXT:    sw s9, 40(sp)
-; ZFINX32-NEXT:    sw s8, 36(sp)
-; ZFINX32-NEXT:    sw s7, 32(sp)
-; ZFINX32-NEXT:    sw s6, 28(sp)
-; ZFINX32-NEXT:    sw s5, 24(sp)
-; ZFINX32-NEXT:    sw s4, 20(sp)
-; ZFINX32-NEXT:    sw s3, 16(sp)
-; ZFINX32-NEXT:    sw s2, 12(sp)
-; ZFINX32-NEXT:    sw s1, 8(sp)
-; ZFINX32-NEXT:    sw s0, 4(sp)
+; ZFINX32-NEXT:    sw a0, 76(sp)
+; ZFINX32-NEXT:    sw a1, 72(sp)
+; ZFINX32-NEXT:    sw a2, 68(sp)
+; ZFINX32-NEXT:    sw a3, 64(sp)
+; ZFINX32-NEXT:    sw a4, 60(sp)
+; ZFINX32-NEXT:    sw ra, 56(sp)
+; ZFINX32-NEXT:    sw s11, 52(sp)
+; ZFINX32-NEXT:    sw s10, 48(sp)
+; ZFINX32-NEXT:    sw s9, 44(sp)
+; ZFINX32-NEXT:    sw s8, 40(sp)
+; ZFINX32-NEXT:    sw s7, 36(sp)
+; ZFINX32-NEXT:    sw s6, 32(sp)
+; ZFINX32-NEXT:    sw s5, 28(sp)
+; ZFINX32-NEXT:    sw s4, 24(sp)
+; ZFINX32-NEXT:    sw s3, 20(sp)
+; ZFINX32-NEXT:    sw s2, 16(sp)
+; ZFINX32-NEXT:    sw s1, 12(sp)
+; ZFINX32-NEXT:    sw s0, 8(sp)
+; ZFINX32-NEXT:    sw t2, 4(sp)
 ; ZFINX32-NEXT:    sw t1, 0(sp)
 ; ZFINX32-NEXT:    mv a0, t0
-; ZFINX32-NEXT:    lw a1, 88(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw a2, 84(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw a3, 80(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw a4, 76(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw a1, 104(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw a2, 100(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw a3, 96(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw a4, 92(sp) # 4-byte Folded Reload
 ; ZFINX32-NEXT:    call callee_half_32
 ; ZFINX32-NEXT:    lui a1, 1048560
 ; ZFINX32-NEXT:    or a0, a0, a1
-; ZFINX32-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    addi sp, sp, 144
+; ZFINX32-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s1, 148(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s2, 144(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s3, 140(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s4, 136(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s5, 132(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s6, 128(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s7, 124(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s8, 120(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s9, 116(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s10, 112(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s11, 108(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    addi sp, sp, 160
 ; ZFINX32-NEXT:    ret
 ;
 ; ZFINX64-LABEL: caller_half_32:
 ; ZFINX64:       # %bb.0:
-; ZFINX64-NEXT:    addi sp, sp, -288
-; ZFINX64-NEXT:    sd ra, 280(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s0, 272(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s1, 264(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s2, 256(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s3, 248(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s4, 240(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s5, 232(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s6, 224(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s7, 216(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s8, 208(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s9, 200(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s10, 192(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    sd s11, 184(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    addi sp, sp, -304
+; ZFINX64-NEXT:    sd ra, 296(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s0, 288(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s1, 280(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s2, 272(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s3, 264(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s4, 256(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s5, 248(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s6, 240(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s7, 232(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s8, 224(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s9, 216(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s10, 208(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd s11, 200(sp) # 8-byte Folded Spill
 ; ZFINX64-NEXT:    ld t0, 0(a0)
 ; ZFINX64-NEXT:    ld a1, 8(a0)
-; ZFINX64-NEXT:    sd a1, 176(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd a1, 192(sp) # 8-byte Folded Spill
 ; ZFINX64-NEXT:    ld a1, 16(a0)
-; ZFINX64-NEXT:    sd a1, 168(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd a1, 184(sp) # 8-byte Folded Spill
 ; ZFINX64-NEXT:    ld a1, 24(a0)
-; ZFINX64-NEXT:    sd a1, 160(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd a1, 176(sp) # 8-byte Folded Spill
 ; ZFINX64-NEXT:    ld a1, 32(a0)
-; ZFINX64-NEXT:    sd a1, 152(sp) # 8-byte Folded Spill
+; ZFINX64-NEXT:    sd a1, 168(sp) # 8-byte Folded Spill
 ; ZFINX64-NEXT:    ld a5, 40(a0)
 ; ZFINX64-NEXT:    ld a6, 48(a0)
 ; ZFINX64-NEXT:    ld a7, 56(a0)
-; ZFINX64-NEXT:    ld t2, 64(a0)
-; ZFINX64-NEXT:    ld t3, 72(a0)
-; ZFINX64-NEXT:    ld t4, 80(a0)
-; ZFINX64-NEXT:    ld t5, 88(a0)
-; ZFINX64-NEXT:    ld t6, 96(a0)
-; ZFINX64-NEXT:    ld t1, 104(a0)
+; ZFINX64-NEXT:    ld t3, 64(a0)
+; ZFINX64-NEXT:    ld t4, 72(a0)
+; ZFINX64-NEXT:    ld t5, 80(a0)
+; ZFINX64-NEXT:    ld t6, 88(a0)
+; ZFINX64-NEXT:    ld t1, 96(a0)
+; ZFINX64-NEXT:    ld t2, 104(a0)
 ; ZFINX64-NEXT:    ld s0, 112(a0)
 ; ZFINX64-NEXT:    ld s1, 120(a0)
 ; ZFINX64-NEXT:    ld s2, 128(a0)
@@ -559,83 +562,84 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZFINX64-NEXT:    ld a2, 232(a0)
 ; ZFINX64-NEXT:    ld a1, 240(a0)
 ; ZFINX64-NEXT:    ld a0, 248(a0)
-; ZFINX64-NEXT:    sd a0, 144(sp)
-; ZFINX64-NEXT:    sd a1, 136(sp)
-; ZFINX64-NEXT:    sd a2, 128(sp)
-; ZFINX64-NEXT:    sd a3, 120(sp)
-; ZFINX64-NEXT:    sd a4, 112(sp)
-; ZFINX64-NEXT:    sd ra, 104(sp)
-; ZFINX64-NEXT:    sd s11, 96(sp)
-; ZFINX64-NEXT:    sd s10, 88(sp)
-; ZFINX64-NEXT:    sd s9, 80(sp)
-; ZFINX64-NEXT:    sd s8, 72(sp)
-; ZFINX64-NEXT:    sd s7, 64(sp)
-; ZFINX64-NEXT:    sd s6, 56(sp)
-; ZFINX64-NEXT:    sd s5, 48(sp)
-; ZFINX64-NEXT:    sd s4, 40(sp)
-; ZFINX64-NEXT:    sd s3, 32(sp)
-; ZFINX64-NEXT:    sd s2, 24(sp)
-; ZFINX64-NEXT:    sd s1, 16(sp)
-; ZFINX64-NEXT:    sd s0, 8(sp)
+; ZFINX64-NEXT:    sd a0, 152(sp)
+; ZFINX64-NEXT:    sd a1, 144(sp)
+; ZFINX64-NEXT:    sd a2, 136(sp)
+; ZFINX64-NEXT:    sd a3, 128(sp)
+; ZFINX64-NEXT:    sd a4, 120(sp)
+; ZFINX64-NEXT:    sd ra, 112(sp)
+; ZFINX64-NEXT:    sd s11, 104(sp)
+; ZFINX64-NEXT:    sd s10, 96(sp)
+; ZFINX64-NEXT:    sd s9, 88(sp)
+; ZFINX64-NEXT:    sd s8, 80(sp)
+; ZFINX64-NEXT:    sd s7, 72(sp)
+; ZFINX64-NEXT:    sd s6, 64(sp)
+; ZFINX64-NEXT:    sd s5, 56(sp)
+; ZFINX64-NEXT:    sd s4, 48(sp)
+; ZFINX64-NEXT:    sd s3, 40(sp)
+; ZFINX64-NEXT:    sd s2, 32(sp)
+; ZFINX64-NEXT:    sd s1, 24(sp)
+; ZFINX64-NEXT:    sd s0, 16(sp)
+; ZFINX64-NEXT:    sd t2, 8(sp)
 ; ZFINX64-NEXT:    sd t1, 0(sp)
 ; ZFINX64-NEXT:    mv a0, t0
-; ZFINX64-NEXT:    ld a1, 176(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld a2, 168(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld a3, 160(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld a4, 152(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld a1, 192(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld a2, 184(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld a3, 176(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld a4, 168(sp) # 8-byte Folded Reload
 ; ZFINX64-NEXT:    call callee_half_32
 ; ZFINX64-NEXT:    lui a1, 1048560
 ; ZFINX64-NEXT:    or a0, a0, a1
-; ZFINX64-NEXT:    ld ra, 280(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s0, 272(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s1, 264(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s2, 256(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s3, 248(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s4, 240(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s5, 232(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s6, 224(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s7, 216(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s8, 208(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s9, 200(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s10, 192(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld s11, 184(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    addi sp, sp, 288
+; ZFINX64-NEXT:    ld ra, 296(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s0, 288(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s1, 280(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s2, 272(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s3, 264(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s4, 256(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s5, 248(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s6, 240(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s7, 232(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s8, 224(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s9, 216(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s10, 208(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld s11, 200(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    addi sp, sp, 304
 ; ZFINX64-NEXT:    ret
 ;
 ; ZDINX32-LABEL: caller_half_32:
 ; ZDINX32:       # %bb.0:
-; ZDINX32-NEXT:    addi sp, sp, -144
-; ZDINX32-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    addi sp, sp, -160
+; ZDINX32-NEXT:    sw ra, 156(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s0, 152(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s1, 148(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s2, 144(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s3, 140(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s4, 136(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s5, 132(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s6, 128(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s7, 124(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s8, 120(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s9, 116(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s10, 112(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s11, 108(sp) # 4-byte Folded Spill
 ; ZDINX32-NEXT:    lw t0, 0(a0)
 ; ZDINX32-NEXT:    lw a1, 4(a0)
-; ZDINX32-NEXT:    sw a1, 88(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw a1, 104(sp) # 4-byte Folded Spill
 ; ZDINX32-NEXT:    lw a1, 8(a0)
-; ZDINX32-NEXT:    sw a1, 84(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw a1, 100(sp) # 4-byte Folded Spill
 ; ZDINX32-NEXT:    lw a1, 12(a0)
-; ZDINX32-NEXT:    sw a1, 80(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw a1, 96(sp) # 4-byte Folded Spill
 ; ZDINX32-NEXT:    lw a1, 16(a0)
-; ZDINX32-NEXT:    sw a1, 76(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw a1, 92(sp) # 4-byte Folded Spill
 ; ZDINX32-NEXT:    lw a5, 20(a0)
 ; ZDINX32-NEXT:    lw a6, 24(a0)
 ; ZDINX32-NEXT:    lw a7, 28(a0)
-; ZDINX32-NEXT:    lw t2, 32(a0)
-; ZDINX32-NEXT:    lw t3, 36(a0)
-; ZDINX32-NEXT:    lw t4, 40(a0)
-; ZDINX32-NEXT:    lw t5, 44(a0)
-; ZDINX32-NEXT:    lw t6, 48(a0)
-; ZDINX32-NEXT:    lw t1, 52(a0)
+; ZDINX32-NEXT:    lw t3, 32(a0)
+; ZDINX32-NEXT:    lw t4, 36(a0)
+; ZDINX32-NEXT:    lw t5, 40(a0)
+; ZDINX32-NEXT:    lw t6, 44(a0)
+; ZDINX32-NEXT:    lw t1, 48(a0)
+; ZDINX32-NEXT:    lw t2, 52(a0)
 ; ZDINX32-NEXT:    lw s0, 56(a0)
 ; ZDINX32-NEXT:    lw s1, 60(a0)
 ; ZDINX32-NEXT:    lw s2, 64(a0)
@@ -654,83 +658,84 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZDINX32-NEXT:    lw a2, 116(a0)
 ; ZDINX32-NEXT:    lw a1, 120(a0)
 ; ZDINX32-NEXT:    lw a0, 124(a0)
-; ZDINX32-NEXT:    sw a0, 72(sp)
-; ZDINX32-NEXT:    sw a1, 68(sp)
-; ZDINX32-NEXT:    sw a2, 64(sp)
-; ZDINX32-NEXT:    sw a3, 60(sp)
-; ZDINX32-NEXT:    sw a4, 56(sp)
-; ZDINX32-NEXT:    sw ra, 52(sp)
-; ZDINX32-NEXT:    sw s11, 48(sp)
-; ZDINX32-NEXT:    sw s10, 44(sp)
-; ZDINX32-NEXT:    sw s9, 40(sp)
-; ZDINX32-NEXT:    sw s8, 36(sp)
-; ZDINX32-NEXT:    sw s7, 32(sp)
-; ZDINX32-NEXT:    sw s6, 28(sp)
-; ZDINX32-NEXT:    sw s5, 24(sp)
-; ZDINX32-NEXT:    sw s4, 20(sp)
-; ZDINX32-NEXT:    sw s3, 16(sp)
-; ZDINX32-NEXT:    sw s2, 12(sp)
-; ZDINX32-NEXT:    sw s1, 8(sp)
-; ZDINX32-NEXT:    sw s0, 4(sp)
+; ZDINX32-NEXT:    sw a0, 76(sp)
+; ZDINX32-NEXT:    sw a1, 72(sp)
+; ZDINX32-NEXT:    sw a2, 68(sp)
+; ZDINX32-NEXT:    sw a3, 64(sp)
+; ZDINX32-NEXT:    sw a4, 60(sp)
+; ZDINX32-NEXT:    sw ra, 56(sp)
+; ZDINX32-NEXT:    sw s11, 52(sp)
+; ZDINX32-NEXT:    sw s10, 48(sp)
+; ZDINX32-NEXT:    sw s9, 44(sp)
+; ZDINX32-NEXT:    sw s8, 40(sp)
+; ZDINX32-NEXT:    sw s7, 36(sp)
+; ZDINX32-NEXT:    sw s6, 32(sp)
+; ZDINX32-NEXT:    sw s5, 28(sp)
+; ZDINX32-NEXT:    sw s4, 24(sp)
+; ZDINX32-NEXT:    sw s3, 20(sp)
+; ZDINX32-NEXT:    sw s2, 16(sp)
+; ZDINX32-NEXT:    sw s1, 12(sp)
+; ZDINX32-NEXT:    sw s0, 8(sp)
+; ZDINX32-NEXT:    sw t2, 4(sp)
 ; ZDINX32-NEXT:    sw t1, 0(sp)
 ; ZDINX32-NEXT:    mv a0, t0
-; ZDINX32-NEXT:    lw a1, 88(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw a2, 84(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw a3, 80(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw a4, 76(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw a1, 104(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw a2, 100(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw a3, 96(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw a4, 92(sp) # 4-byte Folded Reload
 ; ZDINX32-NEXT:    call callee_half_32
 ; ZDINX32-NEXT:    lui a1, 1048560
 ; ZDINX32-NEXT:    or a0, a0, a1
-; ZDINX32-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    addi sp, sp, 144
+; ZDINX32-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s1, 148(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s2, 144(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s3, 140(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s4, 136(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s5, 132(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s6, 128(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s7, 124(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s8, 120(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s9, 116(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s10, 112(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s11, 108(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    addi sp, sp, 160
 ; ZDINX32-NEXT:    ret
 ;
 ; ZDINX64-LABEL: caller_half_32:
 ; ZDINX64:       # %bb.0:
-; ZDINX64-NEXT:    addi sp, sp, -288
-; ZDINX64-NEXT:    sd ra, 280(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s0, 272(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s1, 264(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s2, 256(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s3, 248(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s4, 240(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s5, 232(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s6, 224(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s7, 216(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s8, 208(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s9, 200(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s10, 192(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    sd s11, 184(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    addi sp, sp, -304
+; ZDINX64-NEXT:    sd ra, 296(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s0, 288(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s1, 280(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s2, 272(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s3, 264(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s4, 256(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s5, 248(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s6, 240(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s7, 232(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s8, 224(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s9, 216(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s10, 208(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd s11, 200(sp) # 8-byte Folded Spill
 ; ZDINX64-NEXT:    ld t0, 0(a0)
 ; ZDINX64-NEXT:    ld a1, 8(a0)
-; ZDINX64-NEXT:    sd a1, 176(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd a1, 192(sp) # 8-byte Folded Spill
 ; ZDINX64-NEXT:    ld a1, 16(a0)
-; ZDINX64-NEXT:    sd a1, 168(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd a1, 184(sp) # 8-byte Folded Spill
 ; ZDINX64-NEXT:    ld a1, 24(a0)
-; ZDINX64-NEXT:    sd a1, 160(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd a1, 176(sp) # 8-byte Folded Spill
 ; ZDINX64-NEXT:    ld a1, 32(a0)
-; ZDINX64-NEXT:    sd a1, 152(sp) # 8-byte Folded Spill
+; ZDINX64-NEXT:    sd a1, 168(sp) # 8-byte Folded Spill
 ; ZDINX64-NEXT:    ld a5, 40(a0)
 ; ZDINX64-NEXT:    ld a6, 48(a0)
 ; ZDINX64-NEXT:    ld a7, 56(a0)
-; ZDINX64-NEXT:    ld t2, 64(a0)
-; ZDINX64-NEXT:    ld t3, 72(a0)
-; ZDINX64-NEXT:    ld t4, 80(a0)
-; ZDINX64-NEXT:    ld t5, 88(a0)
-; ZDINX64-NEXT:    ld t6, 96(a0)
-; ZDINX64-NEXT:    ld t1, 104(a0)
+; ZDINX64-NEXT:    ld t3, 64(a0)
+; ZDINX64-NEXT:    ld t4, 72(a0)
+; ZDINX64-NEXT:    ld t5, 80(a0)
+; ZDINX64-NEXT:    ld t6, 88(a0)
+; ZDINX64-NEXT:    ld t1, 96(a0)
+; ZDINX64-NEXT:    ld t2, 104(a0)
 ; ZDINX64-NEXT:    ld s0, 112(a0)
 ; ZDINX64-NEXT:    ld s1, 120(a0)
 ; ZDINX64-NEXT:    ld s2, 128(a0)
@@ -749,47 +754,48 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZDINX64-NEXT:    ld a2, 232(a0)
 ; ZDINX64-NEXT:    ld a1, 240(a0)
 ; ZDINX64-NEXT:    ld a0, 248(a0)
-; ZDINX64-NEXT:    sd a0, 144(sp)
-; ZDINX64-NEXT:    sd a1, 136(sp)
-; ZDINX64-NEXT:    sd a2, 128(sp)
-; ZDINX64-NEXT:    sd a3, 120(sp)
-; ZDINX64-NEXT:    sd a4, 112(sp)
-; ZDINX64-NEXT:    sd ra, 104(sp)
-; ZDINX64-NEXT:    sd s11, 96(sp)
-; ZDINX64-NEXT:    sd s10, 88(sp)
-; ZDINX64-NEXT:    sd s9, 80(sp)
-; ZDINX64-NEXT:    sd s8, 72(sp)
-; ZDINX64-NEXT:    sd s7, 64(sp)
-; ZDINX64-NEXT:    sd s6, 56(sp)
-; ZDINX64-NEXT:    sd s5, 48(sp)
-; ZDINX64-NEXT:    sd s4, 40(sp)
-; ZDINX64-NEXT:    sd s3, 32(sp)
-; ZDINX64-NEXT:    sd s2, 24(sp)
-; ZDINX64-NEXT:    sd s1, 16(sp)
-; ZDINX64-NEXT:    sd s0, 8(sp)
+; ZDINX64-NEXT:    sd a0, 152(sp)
+; ZDINX64-NEXT:    sd a1, 144(sp)
+; ZDINX64-NEXT:    sd a2, 136(sp)
+; ZDINX64-NEXT:    sd a3, 128(sp)
+; ZDINX64-NEXT:    sd a4, 120(sp)
+; ZDINX64-NEXT:    sd ra, 112(sp)
+; ZDINX64-NEXT:    sd s11, 104(sp)
+; ZDINX64-NEXT:    sd s10, 96(sp)
+; ZDINX64-NEXT:    sd s9, 88(sp)
+; ZDINX64-NEXT:    sd s8, 80(sp)
+; ZDINX64-NEXT:    sd s7, 72(sp)
+; ZDINX64-NEXT:    sd s6, 64(sp)
+; ZDINX64-NEXT:    sd s5, 56(sp)
+; ZDINX64-NEXT:    sd s4, 48(sp)
+; ZDINX64-NEXT:    sd s3, 40(sp)
+; ZDINX64-NEXT:    sd s2, 32(sp)
+; ZDINX64-NEXT:    sd s1, 24(sp)
+; ZDINX64-NEXT:    sd s0, 16(sp)
+; ZDINX64-NEXT:    sd t2, 8(sp)
 ; ZDINX64-NEXT:    sd t1, 0(sp)
 ; ZDINX64-NEXT:    mv a0, t0
-; ZDINX64-NEXT:    ld a1, 176(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld a2, 168(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld a3, 160(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld a4, 152(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld a1, 192(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld a2, 184(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld a3, 176(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld a4, 168(sp) # 8-byte Folded Reload
 ; ZDINX64-NEXT:    call callee_half_32
 ; ZDINX64-NEXT:    lui a1, 1048560
 ; ZDINX64-NEXT:    or a0, a0, a1
-; ZDINX64-NEXT:    ld ra, 280(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s0, 272(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s1, 264(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s2, 256(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s3, 248(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s4, 240(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s5, 232(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s6, 224(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s7, 216(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s8, 208(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s9, 200(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s10, 192(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld s11, 184(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    addi sp, sp, 288
+; ZDINX64-NEXT:    ld ra, 296(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s0, 288(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s1, 280(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s2, 272(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s3, 264(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s4, 256(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s5, 248(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s6, 240(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s7, 232(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s8, 224(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s9, 216(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s10, 208(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld s11, 200(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    addi sp, sp, 304
 ; ZDINX64-NEXT:    ret
 	%C = call fastcc half @callee_half_32(<32 x half> %A)
 	ret half %C
@@ -826,86 +832,87 @@ define fastcc float @callee_float_32(<32 x float> %A) nounwind {
 define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX32-LABEL: caller_float_32:
 ; ZHINX32:       # %bb.0:
-; ZHINX32-NEXT:    addi sp, sp, -144
-; ZHINX32-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t0, 144(sp)
-; ZHINX32-NEXT:    sw t0, 88(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t0, 148(sp)
-; ZHINX32-NEXT:    sw t0, 84(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t0, 152(sp)
-; ZHINX32-NEXT:    sw t0, 80(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t0, 156(sp)
-; ZHINX32-NEXT:    sw t0, 76(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t6, 160(sp)
-; ZHINX32-NEXT:    lw t5, 164(sp)
-; ZHINX32-NEXT:    lw t4, 168(sp)
-; ZHINX32-NEXT:    lw s0, 172(sp)
-; ZHINX32-NEXT:    lw s1, 176(sp)
-; ZHINX32-NEXT:    lw s2, 180(sp)
-; ZHINX32-NEXT:    lw s3, 184(sp)
-; ZHINX32-NEXT:    lw s4, 188(sp)
-; ZHINX32-NEXT:    lw s5, 192(sp)
-; ZHINX32-NEXT:    lw s6, 196(sp)
-; ZHINX32-NEXT:    lw s7, 200(sp)
-; ZHINX32-NEXT:    lw s8, 204(sp)
-; ZHINX32-NEXT:    lw s9, 208(sp)
-; ZHINX32-NEXT:    lw s10, 212(sp)
-; ZHINX32-NEXT:    lw s11, 216(sp)
-; ZHINX32-NEXT:    lw ra, 220(sp)
-; ZHINX32-NEXT:    lw t3, 224(sp)
-; ZHINX32-NEXT:    lw t2, 228(sp)
-; ZHINX32-NEXT:    lw t1, 232(sp)
-; ZHINX32-NEXT:    lw t0, 236(sp)
-; ZHINX32-NEXT:    sw t0, 72(sp)
-; ZHINX32-NEXT:    sw t1, 68(sp)
-; ZHINX32-NEXT:    sw t2, 64(sp)
-; ZHINX32-NEXT:    sw t3, 60(sp)
-; ZHINX32-NEXT:    sw ra, 56(sp)
-; ZHINX32-NEXT:    sw s11, 52(sp)
-; ZHINX32-NEXT:    sw s10, 48(sp)
-; ZHINX32-NEXT:    sw s9, 44(sp)
-; ZHINX32-NEXT:    sw s8, 40(sp)
-; ZHINX32-NEXT:    sw s7, 36(sp)
-; ZHINX32-NEXT:    sw s6, 32(sp)
-; ZHINX32-NEXT:    sw s5, 28(sp)
-; ZHINX32-NEXT:    sw s4, 24(sp)
-; ZHINX32-NEXT:    sw s3, 20(sp)
-; ZHINX32-NEXT:    sw s2, 16(sp)
-; ZHINX32-NEXT:    sw s1, 12(sp)
-; ZHINX32-NEXT:    sw s0, 8(sp)
-; ZHINX32-NEXT:    sw t4, 4(sp)
-; ZHINX32-NEXT:    sw t5, 0(sp)
-; ZHINX32-NEXT:    lw t2, 88(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t3, 84(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t4, 80(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t5, 76(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    addi sp, sp, -160
+; ZHINX32-NEXT:    sw ra, 156(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s0, 152(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s1, 148(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s2, 144(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s3, 140(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s4, 136(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s5, 132(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s6, 128(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s7, 124(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s8, 120(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s9, 116(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s10, 112(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw s11, 108(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    lw t0, 160(sp)
+; ZHINX32-NEXT:    sw t0, 104(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    lw t0, 164(sp)
+; ZHINX32-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    lw t0, 168(sp)
+; ZHINX32-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    lw t0, 172(sp)
+; ZHINX32-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    lw t6, 176(sp)
+; ZHINX32-NEXT:    lw t5, 180(sp)
+; ZHINX32-NEXT:    lw t4, 184(sp)
+; ZHINX32-NEXT:    lw s0, 188(sp)
+; ZHINX32-NEXT:    lw s1, 192(sp)
+; ZHINX32-NEXT:    lw s2, 196(sp)
+; ZHINX32-NEXT:    lw s3, 200(sp)
+; ZHINX32-NEXT:    lw s4, 204(sp)
+; ZHINX32-NEXT:    lw s5, 208(sp)
+; ZHINX32-NEXT:    lw s6, 212(sp)
+; ZHINX32-NEXT:    lw s7, 216(sp)
+; ZHINX32-NEXT:    lw s8, 220(sp)
+; ZHINX32-NEXT:    lw s9, 224(sp)
+; ZHINX32-NEXT:    lw s10, 228(sp)
+; ZHINX32-NEXT:    lw s11, 232(sp)
+; ZHINX32-NEXT:    lw ra, 236(sp)
+; ZHINX32-NEXT:    lw t3, 240(sp)
+; ZHINX32-NEXT:    lw t2, 244(sp)
+; ZHINX32-NEXT:    lw t1, 248(sp)
+; ZHINX32-NEXT:    lw t0, 252(sp)
+; ZHINX32-NEXT:    sw t0, 76(sp)
+; ZHINX32-NEXT:    sw t1, 72(sp)
+; ZHINX32-NEXT:    sw t2, 68(sp)
+; ZHINX32-NEXT:    sw t3, 64(sp)
+; ZHINX32-NEXT:    sw ra, 60(sp)
+; ZHINX32-NEXT:    sw s11, 56(sp)
+; ZHINX32-NEXT:    sw s10, 52(sp)
+; ZHINX32-NEXT:    sw s9, 48(sp)
+; ZHINX32-NEXT:    sw s8, 44(sp)
+; ZHINX32-NEXT:    sw s7, 40(sp)
+; ZHINX32-NEXT:    sw s6, 36(sp)
+; ZHINX32-NEXT:    sw s5, 32(sp)
+; ZHINX32-NEXT:    sw s4, 28(sp)
+; ZHINX32-NEXT:    sw s3, 24(sp)
+; ZHINX32-NEXT:    sw s2, 20(sp)
+; ZHINX32-NEXT:    sw s1, 16(sp)
+; ZHINX32-NEXT:    sw s0, 12(sp)
+; ZHINX32-NEXT:    sw t4, 8(sp)
+; ZHINX32-NEXT:    sw t5, 4(sp)
+; ZHINX32-NEXT:    sw t6, 0(sp)
+; ZHINX32-NEXT:    lw t3, 104(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw t4, 100(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw t5, 96(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw t6, 92(sp) # 4-byte Folded Reload
 ; ZHINX32-NEXT:    call callee_float_32
-; ZHINX32-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    addi sp, sp, 144
+; ZHINX32-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s1, 148(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s2, 144(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s3, 140(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s4, 136(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s5, 132(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s6, 128(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s7, 124(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s8, 120(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s9, 116(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s10, 112(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw s11, 108(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    addi sp, sp, 160
 ; ZHINX32-NEXT:    ret
 ;
 ; ZHINX64-LABEL: caller_float_32:
@@ -952,29 +959,30 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX64-NEXT:    lw t2, 392(sp)
 ; ZHINX64-NEXT:    lw t1, 400(sp)
 ; ZHINX64-NEXT:    lw t0, 408(sp)
-; ZHINX64-NEXT:    sw t0, 72(sp)
-; ZHINX64-NEXT:    sw t1, 68(sp)
-; ZHINX64-NEXT:    sw t2, 64(sp)
-; ZHINX64-NEXT:    sw t3, 60(sp)
-; ZHINX64-NEXT:    sw ra, 56(sp)
-; ZHINX64-NEXT:    sw s11, 52(sp)
-; ZHINX64-NEXT:    sw s10, 48(sp)
-; ZHINX64-NEXT:    sw s9, 44(sp)
-; ZHINX64-NEXT:    sw s8, 40(sp)
-; ZHINX64-NEXT:    sw s7, 36(sp)
-; ZHINX64-NEXT:    sw s6, 32(sp)
-; ZHINX64-NEXT:    sw s5, 28(sp)
-; ZHINX64-NEXT:    sw s4, 24(sp)
-; ZHINX64-NEXT:    sw s3, 20(sp)
-; ZHINX64-NEXT:    sw s2, 16(sp)
-; ZHINX64-NEXT:    sw s1, 12(sp)
-; ZHINX64-NEXT:    sw s0, 8(sp)
-; ZHINX64-NEXT:    sw t4, 4(sp)
-; ZHINX64-NEXT:    sw t5, 0(sp)
-; ZHINX64-NEXT:    ld t2, 112(sp) # 8-byte Folded Reload
-; ZHINX64-NEXT:    ld t3, 104(sp) # 8-byte Folded Reload
-; ZHINX64-NEXT:    ld t4, 96(sp) # 8-byte Folded Reload
-; ZHINX64-NEXT:    ld t5, 88(sp) # 8-byte Folded Reload
+; ZHINX64-NEXT:    sw t0, 76(sp)
+; ZHINX64-NEXT:    sw t1, 72(sp)
+; ZHINX64-NEXT:    sw t2, 68(sp)
+; ZHINX64-NEXT:    sw t3, 64(sp)
+; ZHINX64-NEXT:    sw ra, 60(sp)
+; ZHINX64-NEXT:    sw s11, 56(sp)
+; ZHINX64-NEXT:    sw s10, 52(sp)
+; ZHINX64-NEXT:    sw s9, 48(sp)
+; ZHINX64-NEXT:    sw s8, 44(sp)
+; ZHINX64-NEXT:    sw s7, 40(sp)
+; ZHINX64-NEXT:    sw s6, 36(sp)
+; ZHINX64-NEXT:    sw s5, 32(sp)
+; ZHINX64-NEXT:    sw s4, 28(sp)
+; ZHINX64-NEXT:    sw s3, 24(sp)
+; ZHINX64-NEXT:    sw s2, 20(sp)
+; ZHINX64-NEXT:    sw s1, 16(sp)
+; ZHINX64-NEXT:    sw s0, 12(sp)
+; ZHINX64-NEXT:    sw t4, 8(sp)
+; ZHINX64-NEXT:    sw t5, 4(sp)
+; ZHINX64-NEXT:    sw t6, 0(sp)
+; ZHINX64-NEXT:    ld t3, 112(sp) # 8-byte Folded Reload
+; ZHINX64-NEXT:    ld t4, 104(sp) # 8-byte Folded Reload
+; ZHINX64-NEXT:    ld t5, 96(sp) # 8-byte Folded Reload
+; ZHINX64-NEXT:    ld t6, 88(sp) # 8-byte Folded Reload
 ; ZHINX64-NEXT:    call callee_float_32
 ; ZHINX64-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; ZHINX64-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
@@ -994,86 +1002,87 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ;
 ; ZFINX32-LABEL: caller_float_32:
 ; ZFINX32:       # %bb.0:
-; ZFINX32-NEXT:    addi sp, sp, -144
-; ZFINX32-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t0, 144(sp)
-; ZFINX32-NEXT:    sw t0, 88(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t0, 148(sp)
-; ZFINX32-NEXT:    sw t0, 84(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t0, 152(sp)
-; ZFINX32-NEXT:    sw t0, 80(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t0, 156(sp)
-; ZFINX32-NEXT:    sw t0, 76(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t6, 160(sp)
-; ZFINX32-NEXT:    lw t5, 164(sp)
-; ZFINX32-NEXT:    lw t4, 168(sp)
-; ZFINX32-NEXT:    lw s0, 172(sp)
-; ZFINX32-NEXT:    lw s1, 176(sp)
-; ZFINX32-NEXT:    lw s2, 180(sp)
-; ZFINX32-NEXT:    lw s3, 184(sp)
-; ZFINX32-NEXT:    lw s4, 188(sp)
-; ZFINX32-NEXT:    lw s5, 192(sp)
-; ZFINX32-NEXT:    lw s6, 196(sp)
-; ZFINX32-NEXT:    lw s7, 200(sp)
-; ZFINX32-NEXT:    lw s8, 204(sp)
-; ZFINX32-NEXT:    lw s9, 208(sp)
-; ZFINX32-NEXT:    lw s10, 212(sp)
-; ZFINX32-NEXT:    lw s11, 216(sp)
-; ZFINX32-NEXT:    lw ra, 220(sp)
-; ZFINX32-NEXT:    lw t3, 224(sp)
-; ZFINX32-NEXT:    lw t2, 228(sp)
-; ZFINX32-NEXT:    lw t1, 232(sp)
-; ZFINX32-NEXT:    lw t0, 236(sp)
-; ZFINX32-NEXT:    sw t0, 72(sp)
-; ZFINX32-NEXT:    sw t1, 68(sp)
-; ZFINX32-NEXT:    sw t2, 64(sp)
-; ZFINX32-NEXT:    sw t3, 60(sp)
-; ZFINX32-NEXT:    sw ra, 56(sp)
-; ZFINX32-NEXT:    sw s11, 52(sp)
-; ZFINX32-NEXT:    sw s10, 48(sp)
-; ZFINX32-NEXT:    sw s9, 44(sp)
-; ZFINX32-NEXT:    sw s8, 40(sp)
-; ZFINX32-NEXT:    sw s7, 36(sp)
-; ZFINX32-NEXT:    sw s6, 32(sp)
-; ZFINX32-NEXT:    sw s5, 28(sp)
-; ZFINX32-NEXT:    sw s4, 24(sp)
-; ZFINX32-NEXT:    sw s3, 20(sp)
-; ZFINX32-NEXT:    sw s2, 16(sp)
-; ZFINX32-NEXT:    sw s1, 12(sp)
-; ZFINX32-NEXT:    sw s0, 8(sp)
-; ZFINX32-NEXT:    sw t4, 4(sp)
-; ZFINX32-NEXT:    sw t5, 0(sp)
-; ZFINX32-NEXT:    lw t2, 88(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw t3, 84(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw t4, 80(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw t5, 76(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    addi sp, sp, -160
+; ZFINX32-NEXT:    sw ra, 156(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s0, 152(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s1, 148(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s2, 144(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s3, 140(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s4, 136(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s5, 132(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s6, 128(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s7, 124(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s8, 120(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s9, 116(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s10, 112(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw s11, 108(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    lw t0, 160(sp)
+; ZFINX32-NEXT:    sw t0, 104(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    lw t0, 164(sp)
+; ZFINX32-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    lw t0, 168(sp)
+; ZFINX32-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    lw t0, 172(sp)
+; ZFINX32-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    lw t6, 176(sp)
+; ZFINX32-NEXT:    lw t5, 180(sp)
+; ZFINX32-NEXT:    lw t4, 184(sp)
+; ZFINX32-NEXT:    lw s0, 188(sp)
+; ZFINX32-NEXT:    lw s1, 192(sp)
+; ZFINX32-NEXT:    lw s2, 196(sp)
+; ZFINX32-NEXT:    lw s3, 200(sp)
+; ZFINX32-NEXT:    lw s4, 204(sp)
+; ZFINX32-NEXT:    lw s5, 208(sp)
+; ZFINX32-NEXT:    lw s6, 212(sp)
+; ZFINX32-NEXT:    lw s7, 216(sp)
+; ZFINX32-NEXT:    lw s8, 220(sp)
+; ZFINX32-NEXT:    lw s9, 224(sp)
+; ZFINX32-NEXT:    lw s10, 228(sp)
+; ZFINX32-NEXT:    lw s11, 232(sp)
+; ZFINX32-NEXT:    lw ra, 236(sp)
+; ZFINX32-NEXT:    lw t3, 240(sp)
+; ZFINX32-NEXT:    lw t2, 244(sp)
+; ZFINX32-NEXT:    lw t1, 248(sp)
+; ZFINX32-NEXT:    lw t0, 252(sp)
+; ZFINX32-NEXT:    sw t0, 76(sp)
+; ZFINX32-NEXT:    sw t1, 72(sp)
+; ZFINX32-NEXT:    sw t2, 68(sp)
+; ZFINX32-NEXT:    sw t3, 64(sp)
+; ZFINX32-NEXT:    sw ra, 60(sp)
+; ZFINX32-NEXT:    sw s11, 56(sp)
+; ZFINX32-NEXT:    sw s10, 52(sp)
+; ZFINX32-NEXT:    sw s9, 48(sp)
+; ZFINX32-NEXT:    sw s8, 44(sp)
+; ZFINX32-NEXT:    sw s7, 40(sp)
+; ZFINX32-NEXT:    sw s6, 36(sp)
+; ZFINX32-NEXT:    sw s5, 32(sp)
+; ZFINX32-NEXT:    sw s4, 28(sp)
+; ZFINX32-NEXT:    sw s3, 24(sp)
+; ZFINX32-NEXT:    sw s2, 20(sp)
+; ZFINX32-NEXT:    sw s1, 16(sp)
+; ZFINX32-NEXT:    sw s0, 12(sp)
+; ZFINX32-NEXT:    sw t4, 8(sp)
+; ZFINX32-NEXT:    sw t5, 4(sp)
+; ZFINX32-NEXT:    sw t6, 0(sp)
+; ZFINX32-NEXT:    lw t3, 104(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw t4, 100(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw t5, 96(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw t6, 92(sp) # 4-byte Folded Reload
 ; ZFINX32-NEXT:    call callee_float_32
-; ZFINX32-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    addi sp, sp, 144
+; ZFINX32-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s1, 148(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s2, 144(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s3, 140(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s4, 136(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s5, 132(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s6, 128(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s7, 124(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s8, 120(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s9, 116(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s10, 112(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw s11, 108(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    addi sp, sp, 160
 ; ZFINX32-NEXT:    ret
 ;
 ; ZFINX64-LABEL: caller_float_32:
@@ -1120,29 +1129,30 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX64-NEXT:    lw t2, 392(sp)
 ; ZFINX64-NEXT:    lw t1, 400(sp)
 ; ZFINX64-NEXT:    lw t0, 408(sp)
-; ZFINX64-NEXT:    sw t0, 72(sp)
-; ZFINX64-NEXT:    sw t1, 68(sp)
-; ZFINX64-NEXT:    sw t2, 64(sp)
-; ZFINX64-NEXT:    sw t3, 60(sp)
-; ZFINX64-NEXT:    sw ra, 56(sp)
-; ZFINX64-NEXT:    sw s11, 52(sp)
-; ZFINX64-NEXT:    sw s10, 48(sp)
-; ZFINX64-NEXT:    sw s9, 44(sp)
-; ZFINX64-NEXT:    sw s8, 40(sp)
-; ZFINX64-NEXT:    sw s7, 36(sp)
-; ZFINX64-NEXT:    sw s6, 32(sp)
-; ZFINX64-NEXT:    sw s5, 28(sp)
-; ZFINX64-NEXT:    sw s4, 24(sp)
-; ZFINX64-NEXT:    sw s3, 20(sp)
-; ZFINX64-NEXT:    sw s2, 16(sp)
-; ZFINX64-NEXT:    sw s1, 12(sp)
-; ZFINX64-NEXT:    sw s0, 8(sp)
-; ZFINX64-NEXT:    sw t4, 4(sp)
-; ZFINX64-NEXT:    sw t5, 0(sp)
-; ZFINX64-NEXT:    ld t2, 112(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld t3, 104(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld t4, 96(sp) # 8-byte Folded Reload
-; ZFINX64-NEXT:    ld t5, 88(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    sw t0, 76(sp)
+; ZFINX64-NEXT:    sw t1, 72(sp)
+; ZFINX64-NEXT:    sw t2, 68(sp)
+; ZFINX64-NEXT:    sw t3, 64(sp)
+; ZFINX64-NEXT:    sw ra, 60(sp)
+; ZFINX64-NEXT:    sw s11, 56(sp)
+; ZFINX64-NEXT:    sw s10, 52(sp)
+; ZFINX64-NEXT:    sw s9, 48(sp)
+; ZFINX64-NEXT:    sw s8, 44(sp)
+; ZFINX64-NEXT:    sw s7, 40(sp)
+; ZFINX64-NEXT:    sw s6, 36(sp)
+; ZFINX64-NEXT:    sw s5, 32(sp)
+; ZFINX64-NEXT:    sw s4, 28(sp)
+; ZFINX64-NEXT:    sw s3, 24(sp)
+; ZFINX64-NEXT:    sw s2, 20(sp)
+; ZFINX64-NEXT:    sw s1, 16(sp)
+; ZFINX64-NEXT:    sw s0, 12(sp)
+; ZFINX64-NEXT:    sw t4, 8(sp)
+; ZFINX64-NEXT:    sw t5, 4(sp)
+; ZFINX64-NEXT:    sw t6, 0(sp)
+; ZFINX64-NEXT:    ld t3, 112(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld t4, 104(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld t5, 96(sp) # 8-byte Folded Reload
+; ZFINX64-NEXT:    ld t6, 88(sp) # 8-byte Folded Reload
 ; ZFINX64-NEXT:    call callee_float_32
 ; ZFINX64-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; ZFINX64-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
@@ -1162,86 +1172,87 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ;
 ; ZDINX32-LABEL: caller_float_32:
 ; ZDINX32:       # %bb.0:
-; ZDINX32-NEXT:    addi sp, sp, -144
-; ZDINX32-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t0, 144(sp)
-; ZDINX32-NEXT:    sw t0, 88(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t0, 148(sp)
-; ZDINX32-NEXT:    sw t0, 84(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t0, 152(sp)
-; ZDINX32-NEXT:    sw t0, 80(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t0, 156(sp)
-; ZDINX32-NEXT:    sw t0, 76(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t6, 160(sp)
-; ZDINX32-NEXT:    lw t5, 164(sp)
-; ZDINX32-NEXT:    lw t4, 168(sp)
-; ZDINX32-NEXT:    lw s0, 172(sp)
-; ZDINX32-NEXT:    lw s1, 176(sp)
-; ZDINX32-NEXT:    lw s2, 180(sp)
-; ZDINX32-NEXT:    lw s3, 184(sp)
-; ZDINX32-NEXT:    lw s4, 188(sp)
-; ZDINX32-NEXT:    lw s5, 192(sp)
-; ZDINX32-NEXT:    lw s6, 196(sp)
-; ZDINX32-NEXT:    lw s7, 200(sp)
-; ZDINX32-NEXT:    lw s8, 204(sp)
-; ZDINX32-NEXT:    lw s9, 208(sp)
-; ZDINX32-NEXT:    lw s10, 212(sp)
-; ZDINX32-NEXT:    lw s11, 216(sp)
-; ZDINX32-NEXT:    lw ra, 220(sp)
-; ZDINX32-NEXT:    lw t3, 224(sp)
-; ZDINX32-NEXT:    lw t2, 228(sp)
-; ZDINX32-NEXT:    lw t1, 232(sp)
-; ZDINX32-NEXT:    lw t0, 236(sp)
-; ZDINX32-NEXT:    sw t0, 72(sp)
-; ZDINX32-NEXT:    sw t1, 68(sp)
-; ZDINX32-NEXT:    sw t2, 64(sp)
-; ZDINX32-NEXT:    sw t3, 60(sp)
-; ZDINX32-NEXT:    sw ra, 56(sp)
-; ZDINX32-NEXT:    sw s11, 52(sp)
-; ZDINX32-NEXT:    sw s10, 48(sp)
-; ZDINX32-NEXT:    sw s9, 44(sp)
-; ZDINX32-NEXT:    sw s8, 40(sp)
-; ZDINX32-NEXT:    sw s7, 36(sp)
-; ZDINX32-NEXT:    sw s6, 32(sp)
-; ZDINX32-NEXT:    sw s5, 28(sp)
-; ZDINX32-NEXT:    sw s4, 24(sp)
-; ZDINX32-NEXT:    sw s3, 20(sp)
-; ZDINX32-NEXT:    sw s2, 16(sp)
-; ZDINX32-NEXT:    sw s1, 12(sp)
-; ZDINX32-NEXT:    sw s0, 8(sp)
-; ZDINX32-NEXT:    sw t4, 4(sp)
-; ZDINX32-NEXT:    sw t5, 0(sp)
-; ZDINX32-NEXT:    lw t2, 88(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw t3, 84(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw t4, 80(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw t5, 76(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    addi sp, sp, -160
+; ZDINX32-NEXT:    sw ra, 156(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s0, 152(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s1, 148(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s2, 144(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s3, 140(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s4, 136(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s5, 132(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s6, 128(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s7, 124(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s8, 120(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s9, 116(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s10, 112(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw s11, 108(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    lw t0, 160(sp)
+; ZDINX32-NEXT:    sw t0, 104(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    lw t0, 164(sp)
+; ZDINX32-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    lw t0, 168(sp)
+; ZDINX32-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    lw t0, 172(sp)
+; ZDINX32-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    lw t6, 176(sp)
+; ZDINX32-NEXT:    lw t5, 180(sp)
+; ZDINX32-NEXT:    lw t4, 184(sp)
+; ZDINX32-NEXT:    lw s0, 188(sp)
+; ZDINX32-NEXT:    lw s1, 192(sp)
+; ZDINX32-NEXT:    lw s2, 196(sp)
+; ZDINX32-NEXT:    lw s3, 200(sp)
+; ZDINX32-NEXT:    lw s4, 204(sp)
+; ZDINX32-NEXT:    lw s5, 208(sp)
+; ZDINX32-NEXT:    lw s6, 212(sp)
+; ZDINX32-NEXT:    lw s7, 216(sp)
+; ZDINX32-NEXT:    lw s8, 220(sp)
+; ZDINX32-NEXT:    lw s9, 224(sp)
+; ZDINX32-NEXT:    lw s10, 228(sp)
+; ZDINX32-NEXT:    lw s11, 232(sp)
+; ZDINX32-NEXT:    lw ra, 236(sp)
+; ZDINX32-NEXT:    lw t3, 240(sp)
+; ZDINX32-NEXT:    lw t2, 244(sp)
+; ZDINX32-NEXT:    lw t1, 248(sp)
+; ZDINX32-NEXT:    lw t0, 252(sp)
+; ZDINX32-NEXT:    sw t0, 76(sp)
+; ZDINX32-NEXT:    sw t1, 72(sp)
+; ZDINX32-NEXT:    sw t2, 68(sp)
+; ZDINX32-NEXT:    sw t3, 64(sp)
+; ZDINX32-NEXT:    sw ra, 60(sp)
+; ZDINX32-NEXT:    sw s11, 56(sp)
+; ZDINX32-NEXT:    sw s10, 52(sp)
+; ZDINX32-NEXT:    sw s9, 48(sp)
+; ZDINX32-NEXT:    sw s8, 44(sp)
+; ZDINX32-NEXT:    sw s7, 40(sp)
+; ZDINX32-NEXT:    sw s6, 36(sp)
+; ZDINX32-NEXT:    sw s5, 32(sp)
+; ZDINX32-NEXT:    sw s4, 28(sp)
+; ZDINX32-NEXT:    sw s3, 24(sp)
+; ZDINX32-NEXT:    sw s2, 20(sp)
+; ZDINX32-NEXT:    sw s1, 16(sp)
+; ZDINX32-NEXT:    sw s0, 12(sp)
+; ZDINX32-NEXT:    sw t4, 8(sp)
+; ZDINX32-NEXT:    sw t5, 4(sp)
+; ZDINX32-NEXT:    sw t6, 0(sp)
+; ZDINX32-NEXT:    lw t3, 104(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw t4, 100(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw t5, 96(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw t6, 92(sp) # 4-byte Folded Reload
 ; ZDINX32-NEXT:    call callee_float_32
-; ZDINX32-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    addi sp, sp, 144
+; ZDINX32-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s1, 148(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s2, 144(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s3, 140(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s4, 136(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s5, 132(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s6, 128(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s7, 124(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s8, 120(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s9, 116(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s10, 112(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw s11, 108(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    addi sp, sp, 160
 ; ZDINX32-NEXT:    ret
 ;
 ; ZDINX64-LABEL: caller_float_32:
@@ -1288,29 +1299,30 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX64-NEXT:    lw t2, 392(sp)
 ; ZDINX64-NEXT:    lw t1, 400(sp)
 ; ZDINX64-NEXT:    lw t0, 408(sp)
-; ZDINX64-NEXT:    sw t0, 72(sp)
-; ZDINX64-NEXT:    sw t1, 68(sp)
-; ZDINX64-NEXT:    sw t2, 64(sp)
-; ZDINX64-NEXT:    sw t3, 60(sp)
-; ZDINX64-NEXT:    sw ra, 56(sp)
-; ZDINX64-NEXT:    sw s11, 52(sp)
-; ZDINX64-NEXT:    sw s10, 48(sp)
-; ZDINX64-NEXT:    sw s9, 44(sp)
-; ZDINX64-NEXT:    sw s8, 40(sp)
-; ZDINX64-NEXT:    sw s7, 36(sp)
-; ZDINX64-NEXT:    sw s6, 32(sp)
-; ZDINX64-NEXT:    sw s5, 28(sp)
-; ZDINX64-NEXT:    sw s4, 24(sp)
-; ZDINX64-NEXT:    sw s3, 20(sp)
-; ZDINX64-NEXT:    sw s2, 16(sp)
-; ZDINX64-NEXT:    sw s1, 12(sp)
-; ZDINX64-NEXT:    sw s0, 8(sp)
-; ZDINX64-NEXT:    sw t4, 4(sp)
-; ZDINX64-NEXT:    sw t5, 0(sp)
-; ZDINX64-NEXT:    ld t2, 112(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld t3, 104(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld t4, 96(sp) # 8-byte Folded Reload
-; ZDINX64-NEXT:    ld t5, 88(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    sw t0, 76(sp)
+; ZDINX64-NEXT:    sw t1, 72(sp)
+; ZDINX64-NEXT:    sw t2, 68(sp)
+; ZDINX64-NEXT:    sw t3, 64(sp)
+; ZDINX64-NEXT:    sw ra, 60(sp)
+; ZDINX64-NEXT:    sw s11, 56(sp)
+; ZDINX64-NEXT:    sw s10, 52(sp)
+; ZDINX64-NEXT:    sw s9, 48(sp)
+; ZDINX64-NEXT:    sw s8, 44(sp)
+; ZDINX64-NEXT:    sw s7, 40(sp)
+; ZDINX64-NEXT:    sw s6, 36(sp)
+; ZDINX64-NEXT:    sw s5, 32(sp)
+; ZDINX64-NEXT:    sw s4, 28(sp)
+; ZDINX64-NEXT:    sw s3, 24(sp)
+; ZDINX64-NEXT:    sw s2, 20(sp)
+; ZDINX64-NEXT:    sw s1, 16(sp)
+; ZDINX64-NEXT:    sw s0, 12(sp)
+; ZDINX64-NEXT:    sw t4, 8(sp)
+; ZDINX64-NEXT:    sw t5, 4(sp)
+; ZDINX64-NEXT:    sw t6, 0(sp)
+; ZDINX64-NEXT:    ld t3, 112(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld t4, 104(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld t5, 96(sp) # 8-byte Folded Reload
+; ZDINX64-NEXT:    ld t6, 88(sp) # 8-byte Folded Reload
 ; ZDINX64-NEXT:    call callee_float_32
 ; ZDINX64-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; ZDINX64-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
index ee9f96a45d23e..fb84a2528778a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
@@ -502,8 +502,8 @@ define fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, t4, a0
-; CHECK-NEXT:    vl8re32.v v24, (t4)
+; CHECK-NEXT:    add a0, t5, a0
+; CHECK-NEXT:    vl8re32.v v24, (t5)
 ; CHECK-NEXT:    vl8re32.v v0, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v24
@@ -521,25 +521,31 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV32-NEXT:    .cfi_def_cfa_offset 144
 ; RV32-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s1, -12
 ; RV32-NEXT:    addi s0, sp, 144
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -128
+; RV32-NEXT:    mv s1, sp
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    addi a1, sp, 128
+; RV32-NEXT:    addi a1, s1, 128
 ; RV32-NEXT:    vs8r.v v8, (a1)
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    add a2, s1, a2
 ; RV32-NEXT:    addi a2, a2, 128
 ; RV32-NEXT:    vs8r.v v8, (a2)
+; RV32-NEXT:    li a3, 8
+; RV32-NEXT:    sw a3, 0(sp)
 ; RV32-NEXT:    add a1, a1, a0
 ; RV32-NEXT:    vs8r.v v8, (a1)
 ; RV32-NEXT:    add a0, a2, a0
@@ -550,47 +556,54 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    li a6, 6
 ; RV32-NEXT:    li a7, 7
-; RV32-NEXT:    csrr t2, vlenb
-; RV32-NEXT:    slli t2, t2, 4
-; RV32-NEXT:    add t2, sp, t2
-; RV32-NEXT:    addi t2, t2, 128
-; RV32-NEXT:    addi t4, sp, 128
-; RV32-NEXT:    li t6, 8
+; RV32-NEXT:    csrr t3, vlenb
+; RV32-NEXT:    slli t3, t3, 4
+; RV32-NEXT:    add t3, s1, t3
+; RV32-NEXT:    addi t3, t3, 128
+; RV32-NEXT:    addi t5, s1, 128
 ; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    call vector_arg_indirect_stack
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    addi sp, s0, -144
 ; RV32-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 144
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: pass_vector_arg_indirect_stack:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -144
-; RV64-NEXT:    .cfi_def_cfa_offset 144
-; RV64-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 128(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -160
+; RV64-NEXT:    .cfi_def_cfa_offset 160
+; RV64-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 144(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 136(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 144
+; RV64-NEXT:    .cfi_offset s1, -24
+; RV64-NEXT:    addi s0, sp, 160
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -128
+; RV64-NEXT:    mv s1, sp
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    addi a1, sp, 128
+; RV64-NEXT:    addi a1, s1, 128
 ; RV64-NEXT:    vs8r.v v8, (a1)
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 4
-; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    add a2, s1, a2
 ; RV64-NEXT:    addi a2, a2, 128
 ; RV64-NEXT:    vs8r.v v8, (a2)
+; RV64-NEXT:    li a3, 8
+; RV64-NEXT:    sd a3, 0(sp)
 ; RV64-NEXT:    add a1, a1, a0
 ; RV64-NEXT:    vs8r.v v8, (a1)
 ; RV64-NEXT:    add a0, a2, a0
@@ -601,20 +614,21 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV64-NEXT:    li a5, 5
 ; RV64-NEXT:    li a6, 6
 ; RV64-NEXT:    li a7, 7
-; RV64-NEXT:    csrr t2, vlenb
-; RV64-NEXT:    slli t2, t2, 4
-; RV64-NEXT:    add t2, sp, t2
-; RV64-NEXT:    addi t2, t2, 128
-; RV64-NEXT:    addi t4, sp, 128
-; RV64-NEXT:    li t6, 8
+; RV64-NEXT:    csrr t3, vlenb
+; RV64-NEXT:    slli t3, t3, 4
+; RV64-NEXT:    add t3, s1, t3
+; RV64-NEXT:    addi t3, t3, 128
+; RV64-NEXT:    addi t5, s1, 128
 ; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    vmv.v.i v16, 0
 ; RV64-NEXT:    call vector_arg_indirect_stack
-; RV64-NEXT:    addi sp, s0, -144
-; RV64-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 128(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 144
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    addi sp, s0, -160
+; RV64-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 160
 ; RV64-NEXT:    ret
   %s = call fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 8)
   ret <vscale x 32 x i32> %s
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
index 63cd42e97ef6f..9f48fdb3608a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
@@ -230,7 +230,7 @@ define fastcc <32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (t2)
+; CHECK-NEXT:    vle32.v v16, (t3)
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %s = add <32 x i32> %x, %z
@@ -261,8 +261,8 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3
 ; CHECK-NEXT:    li a5, 5
 ; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
-; CHECK-NEXT:    mv t2, sp
-; CHECK-NEXT:    li t3, 8
+; CHECK-NEXT:    mv t3, sp
+; CHECK-NEXT:    li t4, 8
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
@@ -281,7 +281,7 @@ define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3
 ; CHECK-LABEL: vector_arg_direct_stack:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    addi a1, sp, 8
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v24, (a1)
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
@@ -303,11 +303,13 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32>
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    addi a0, sp, 8
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    sd a0, 136(sp)
+; CHECK-NEXT:    sd a0, 144(sp)
 ; CHECK-NEXT:    li a0, 13
+; CHECK-NEXT:    sd a0, 8(sp)
+; CHECK-NEXT:    li a0, 12
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
@@ -315,11 +317,10 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32>
 ; CHECK-NEXT:    li a5, 5
 ; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
-; CHECK-NEXT:    li t2, 8
-; CHECK-NEXT:    li t3, 9
-; CHECK-NEXT:    li t4, 10
-; CHECK-NEXT:    li t5, 11
-; CHECK-NEXT:    li t6, 12
+; CHECK-NEXT:    li t3, 8
+; CHECK-NEXT:    li t4, 9
+; CHECK-NEXT:    li t5, 10
+; CHECK-NEXT:    li t6, 11
 ; CHECK-NEXT:    sd a0, 0(sp)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
@@ -336,7 +337,7 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32>
 define fastcc <4 x i1> @vector_mask_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, <4 x i1> %m1, <4 x i1> %m2, i32 %last) {
 ; CHECK-LABEL: vector_mask_arg_direct_stack:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a0, sp, 136
+; CHECK-NEXT:    addi a0, sp, 144
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vlm.v v8, (a0)
 ; CHECK-NEXT:    vmxor.mm v0, v0, v8

>From 09acc16171bd22048e96c7e3fab0c97a119adbf9 Mon Sep 17 00:00:00 2001
From: Yeting Kuo <yeting.kuo at sifive.com>
Date: Mon, 15 Jul 2024 22:59:18 -0700
Subject: [PATCH 2/2] Add comments about the reason to remove x7.

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2d7fcd0225c51..ee3912f928f33 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18884,6 +18884,7 @@ ArrayRef<MCPhysReg> RISCV::getArgGPRs(const RISCVABI::ABI ABI) {
 static ArrayRef<MCPhysReg> getFastCCArgGPRs(const RISCVABI::ABI ABI) {
   // The GPRs used for passing arguments in the FastCC, X5 and X6 might be used
   // for save-restore libcall, so we don't use them.
+  // Don't use X7 for fastcc, since Zicfilp uses X7 as the label register.
   static const MCPhysReg FastCCIGPRs[] = {
       RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
       RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X28, RISCV::X29,



More information about the llvm-commits mailing list