[llvm] [NVPTX] support immediate values in st.param instructions (PR #91523)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Tue May 14 08:58:19 PDT 2024


https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/91523

>From d56a4ceadc417dcc5b8f1d4caaf952af7c130887 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Wed, 8 May 2024 19:22:17 +0000
Subject: [PATCH 1/4] [NVPTX] support immediate values in st.param instructions

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 131 ++++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     |  99 ++--
 llvm/test/CodeGen/NVPTX/st-param-imm.ll     | 592 ++++++++++++++++++++
 3 files changed, 768 insertions(+), 54 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/st-param-imm.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 595395bb1b4b4..fc9d760093d66 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2182,6 +2182,84 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
   return true;
 }
 
+// Helpers for constructing an opcode name (e.g. NVPTX::StoreParamV4F32_iiri)
+#define getOpcV2H(ty, op0, op1) NVPTX::StoreParamV2##ty##_##op0##op1
+
+#define getOpcV2H1(ty, op0, op1)                                               \
+  (op1) ? getOpcV2H(ty, op0, i) : getOpcV2H(ty, op0, r)
+
+#define getOpcodeForVectorStParamV2(ty, isimm)                                 \
+  (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
+
+#define getOpcV4H(ty, op0, op1, op2, op3)                                      \
+  NVPTX::StoreParamV4##ty##_##op0##op1##op2##op3
+
+#define getOpcV4H3(ty, op0, op1, op2, op3)                                     \
+  (op3) ? getOpcV4H(ty, op0, op1, op2, i) : getOpcV4H(ty, op0, op1, op2, r)
+
+#define getOpcV4H2(ty, op0, op1, op2, op3)                                     \
+  (op2) ? getOpcV4H3(ty, op0, op1, i, op3) : getOpcV4H3(ty, op0, op1, r, op3)
+
+#define getOpcV4H1(ty, op0, op1, op2, op3)                                     \
+  (op1) ? getOpcV4H2(ty, op0, i, op2, op3) : getOpcV4H2(ty, op0, r, op2, op3)
+
+#define getOpcodeForVectorStParamV4(ty, isimm)                                 \
+  (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3])                 \
+             : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
+
+#define getOpcodeForVectorStParam(n, ty, isimm)                                \
+  (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm)                            \
+           : getOpcodeForVectorStParamV4(ty, isimm)
+
+static std::optional<unsigned>
+pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, unsigned NumElts,
+                           MVT::SimpleValueType MemTy, SelectionDAG *CurDAG,
+                           SDLoc DL) {
+  // Record which operands are constants and rewrite them as target constants.
+  // NOTE(review): the cast<> below assumes non-f32/f64 constants are integers.
+  SmallVector<bool, 4> IsImm(NumElts, false);
+  for (unsigned i = 0; i < NumElts; i++) {
+    IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
+    if (IsImm[i]) {
+      SDValue Imm = Ops[i];
+      if (MemTy == MVT::f32 || MemTy == MVT::f64) {
+        const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
+        const ConstantFP *CF = ConstImm->getConstantFPValue();
+        Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
+      } else {
+        const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
+        const ConstantInt *CI = ConstImm->getConstantIntValue();
+        Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
+      }
+      Ops[i] = Imm;
+    }
+  }
+
+  // Pick the opcode for MemTy, NumElts and the reg/imm pattern, else nullopt.
+  switch (MemTy) {
+  case MVT::i8:
+    return getOpcodeForVectorStParam(NumElts, I8, IsImm);
+  case MVT::i16:
+    return getOpcodeForVectorStParam(NumElts, I16, IsImm);
+  case MVT::i32:
+    return getOpcodeForVectorStParam(NumElts, I32, IsImm);
+  case MVT::i64:
+    if (NumElts == 4)
+      return std::nullopt;
+    return getOpcodeForVectorStParamV2(I64, IsImm);
+  case MVT::f32:
+    return getOpcodeForVectorStParam(NumElts, F32, IsImm);
+  case MVT::f64:
+    if (NumElts == 4)
+      return std::nullopt;
+    return getOpcodeForVectorStParamV2(F64, IsImm);
+  case MVT::f16:
+  case MVT::v2f16:
+  default:
+    return std::nullopt;
+  }
+}
+
 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
   SDLoc DL(N);
   SDValue Chain = N->getOperand(0);
@@ -2228,12 +2306,34 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
     switch (NumElts) {
     default:
       return false;
-    case 1:
-      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
-                               NVPTX::StoreParamI8, NVPTX::StoreParamI16,
-                               NVPTX::StoreParamI32, NVPTX::StoreParamI64,
-                               NVPTX::StoreParamF32, NVPTX::StoreParamF64);
-      if (Opcode == NVPTX::StoreParamI8) {
+    case 1: {
+      MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
+      SDValue Imm = Ops[0];
+      if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
+          (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
+        // Convert immediate to target constant
+        if (MemTy == MVT::f32 || MemTy == MVT::f64) {
+          const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
+          const ConstantFP *CF = ConstImm->getConstantFPValue();
+          Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
+        } else {
+          const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
+          const ConstantInt *CI = ConstImm->getConstantIntValue();
+          Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
+        }
+        Ops[0] = Imm;
+        // Use immediate version of store param
+        Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
+                                 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
+                                 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
+                                 NVPTX::StoreParamF64_i);
+      } else
+        Opcode =
+            pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+                            NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
+                            NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
+                            NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
+      if (Opcode == NVPTX::StoreParamI8_r) {
         // Fine tune the opcode depending on the size of the operand.
         // This helps to avoid creating redundant COPY instructions in
         // InstrEmitter::AddRegisterOperand().
@@ -2249,19 +2349,14 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
         }
       }
       break;
+    }
     case 2:
-      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
-                               NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
-                               NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
-                               NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
-      break;
-    case 4:
-      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
-                               NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
-                               NVPTX::StoreParamV4I32, std::nullopt,
-                               NVPTX::StoreParamV4F32, std::nullopt);
+    case 4: {
+      MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
+      Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
       break;
     }
+    }
     if (!Opcode)
       return false;
     break;
@@ -2269,7 +2364,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
   // conversion instruction first, and use that as the value operand to
   // the selected StoreParam node.
   case NVPTXISD::StoreParamU32: {
-    Opcode = NVPTX::StoreParamI32;
+    Opcode = NVPTX::StoreParamI32_r;
     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
                                                 MVT::i32);
     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
@@ -2278,7 +2373,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
     break;
   }
   case NVPTXISD::StoreParamS32: {
-    Opcode = NVPTX::StoreParamI32;
+    Opcode = NVPTX::StoreParamI32_r;
     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
                                                 MVT::i32);
     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 393fa29ff0516..6c4badfeb742c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2637,25 +2637,49 @@ class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
                 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
 
 let mayStore = true in {
-  class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+  class StoreParamInstReg<NVPTXRegClass regclass, string opstr> :
         NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
-                  !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
+                  "st.param" # opstr # " \t[param$a+$b], $val;",
                   []>;
 
-  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
-                               i32imm:$a, i32imm:$b),
-                  !strconcat("st.param.v2", opstr,
-                             " \t[param$a+$b], {{$val, $val2}};"),
-                  []>;
+  multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
+    def _r: StoreParamInstReg<regclass, opstr>;
 
-  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
-                               regclass:$val4, i32imm:$a,
-                               i32imm:$b),
-                  !strconcat("st.param.v4", opstr,
-                             " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
-                  []>;
+    def _i:
+          NVPTXInst<(outs), (ins IMMType:$val, i32imm:$a, i32imm:$b),
+                    "st.param" # opstr # " \t[param$a+$b], $val;",
+                    []>;
+  }
+
+  multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
+    foreach op1 = [IMMType, regclass] in
+      foreach op2 = [IMMType, regclass] in
+        def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
+              # !if(!isa<NVPTXRegClass>(op2), "r", "i")
+          : NVPTXInst<(outs),
+                      (ins op1:$val1, op2:$val2,
+                           i32imm:$a, i32imm:$b),
+                      "st.param.v2" # opstr # " \t[param$a+$b], {{$val1, $val2}};",
+                      []>;
+  }
+
+  multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
+    foreach op1 = [IMMType, regclass] in
+      foreach op2 = [IMMType, regclass] in
+        foreach op3 = [IMMType, regclass] in
+          foreach op4 = [IMMType, regclass] in
+            def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
+                  # !if(!isa<NVPTXRegClass>(op2), "r", "i")
+                  # !if(!isa<NVPTXRegClass>(op3), "r", "i")
+                  # !if(!isa<NVPTXRegClass>(op4), "r", "i")
+
+              : NVPTXInst<(outs),
+                          (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
+                               i32imm:$a, i32imm:$b),
+                          "st.param.v4" # opstr #
+                          " \t[param$a+$b], {{$val1, $val2, $val3, $val4}};",
+                          []>;
+  }
 
   class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
         NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
@@ -2735,27 +2759,30 @@ def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
 def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
 def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
 
-def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
-def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
-
-def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
-def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
-def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">;
-def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">;
-def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
-def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
-def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
-def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;
-
-def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
-def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
-def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;
-
-def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
-def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
-def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
-def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
-def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
+defm StoreParamI64    : StoreParamInst<Int64Regs, i64imm, ".b64">;
+defm StoreParamI32    : StoreParamInst<Int32Regs, i32imm, ".b32">;
+defm StoreParamI16    : StoreParamInst<Int16Regs, i16imm, ".b16">;
+defm StoreParamI8     : StoreParamInst<Int16Regs, i8imm,  ".b8">;
+
+def StoreParamI8TruncI32 : StoreParamInstReg<Int32Regs, ".b8">;
+def StoreParamI8TruncI64 : StoreParamInstReg<Int64Regs, ".b8">;
+
+defm StoreParamV2I64  : StoreParamV2Inst<Int64Regs, i64imm, ".b64">;
+defm StoreParamV2I32  : StoreParamV2Inst<Int32Regs, i32imm, ".b32">;
+defm StoreParamV2I16  : StoreParamV2Inst<Int16Regs, i16imm, ".b16">;
+defm StoreParamV2I8   : StoreParamV2Inst<Int16Regs, i8imm,  ".b8">;
+
+defm StoreParamV4I32  : StoreParamV4Inst<Int32Regs, i32imm, ".b32">;
+defm StoreParamV4I16  : StoreParamV4Inst<Int16Regs, i16imm, ".b16">;
+defm StoreParamV4I8   : StoreParamV4Inst<Int16Regs, i8imm,  ".b8">;
+
+defm StoreParamF32    : StoreParamInst<Float32Regs, f32imm, ".f32">;
+defm StoreParamF64    : StoreParamInst<Float64Regs, f64imm, ".f64">;
+
+defm StoreParamV2F32  : StoreParamV2Inst<Float32Regs, f32imm, ".f32">;
+defm StoreParamV2F64  : StoreParamV2Inst<Float64Regs, f64imm, ".f64">;
+
+defm StoreParamV4F32  : StoreParamV4Inst<Float32Regs, f32imm, ".f32">;
 
 def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
 def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
new file mode 100644
index 0000000000000..022035505e2a7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -0,0 +1,592 @@
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: llc < %s -march=nvptx | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -verify-machineinstrs | %ptxas-verify %}
+
+%struct.A = type { i8, i16 }
+%struct.char2 = type { i8, i8 }
+%struct.char4 = type { i8, i8, i8, i8 }
+%struct.short2 = type { i16, i16 }
+%struct.short4 = type { i16, i16, i16, i16 }
+%struct.int2 = type { i32, i32 }
+%struct.int4 = type { i32, i32, i32, i32 }
+%struct.longlong2 = type { i64, i64 }
+%struct.float2 = type { float, float }
+%struct.float4 = type { float, float, float, float }
+%struct.double2 = type { double, double }
+
+; CHECK-LABEL: st_param_i8_i16
+; CHECK: st.param.b8 [param0+0], 1
+; CHECK: st.param.b16 [param0+2], 2
+define void @st_param_i8_i16() {
+  call void @call_i8_i16(%struct.A { i8 1, i16 2 })
+  ret void
+}
+
+; CHECK-LABEL: st_param_i32
+; CHECK: st.param.b32 [param0+0], 3
+define void @st_param_i32() {
+  call void @call_i32(i32 3)
+  ret void
+}
+
+; CHECK-LABEL: st_param_i64
+; CHECK: st.param.b64 [param0+0], 4
+define void @st_param_i64() {
+  call void @call_i64(i64 4)
+  ret void
+}
+
+; CHECK-LABEL: st_param_f32
+; CHECK: st.param.f32 [param0+0], 0f40A00000
+define void @st_param_f32() {
+  call void @call_f32(float 5.0)
+  ret void
+}
+
+; CHECK-LABEL: st_param_f64
+; CHECK: st.param.f64 [param0+0], 0d4018000000000000
+define void @st_param_f64() {
+  call void @call_f64(double 6.0)
+  ret void
+}
+
+declare void @call_i8_i16(%struct.A)
+declare void @call_i32(i32)
+declare void @call_i64(i64)
+declare void @call_f32(float)
+declare void @call_f64(double)
+
+; CHECK-LABEL: st_param_v2_i8
+; CHECK: st.param.v2.b8 [param0+0], {1, 2}
+; CHECK: st.param.v2.b8 [param0+0], {1, {{%rs[0-9]+}}}
+; CHECK: st.param.v2.b8 [param0+0], {{{%rs[0-9]+}}, 2}
+define void @st_param_v2_i8(i8 %val) {
+  call void @call_v2_i8(%struct.char2 { i8 1, i8 2 })
+  %struct.ir0 = insertvalue %struct.char2 poison, i8 1, 0
+  %struct.ir1 = insertvalue %struct.char2 %struct.ir0, i8 %val, 1
+  call void @call_v2_i8(%struct.char2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.char2 poison, i8 %val, 0
+  %struct.ri1 = insertvalue %struct.char2 %struct.ri0, i8 2, 1
+  call void @call_v2_i8(%struct.char2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_i16
+; CHECK: st.param.v2.b16 [param0+0], {1, 2}
+; CHECK: st.param.v2.b16 [param0+0], {1, {{%rs[0-9]+}}}
+; CHECK: st.param.v2.b16 [param0+0], {{{%rs[0-9]+}}, 2}
+define void @st_param_v2_i16(i16 %val) {
+  call void @call_v2_i16(%struct.short2 { i16 1, i16 2 })
+  %struct.ir0 = insertvalue %struct.short2 poison, i16 1, 0
+  %struct.ir1 = insertvalue %struct.short2 %struct.ir0, i16 %val, 1
+  call void @call_v2_i16(%struct.short2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.short2 poison, i16 %val, 0
+  %struct.ri1 = insertvalue %struct.short2 %struct.ri0, i16 2, 1
+  call void @call_v2_i16(%struct.short2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_i32
+; CHECK: st.param.v2.b32 [param0+0], {1, 2}
+; CHECK: st.param.v2.b32 [param0+0], {1, {{%r[0-9]+}}}
+; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, 2}
+define void @st_param_v2_i32(i32 %val) {
+  call void @call_v2_i32(%struct.int2 { i32 1, i32 2 })
+  %struct.ir0 = insertvalue %struct.int2 poison, i32 1, 0
+  %struct.ir1 = insertvalue %struct.int2 %struct.ir0, i32 %val, 1
+  call void @call_v2_i32(%struct.int2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.int2 poison, i32 %val, 0
+  %struct.ri1 = insertvalue %struct.int2 %struct.ri0, i32 2, 1
+  call void @call_v2_i32(%struct.int2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_i64
+; CHECK: st.param.v2.b64 [param0+0], {1, 2}
+; CHECK: st.param.v2.b64 [param0+0], {1, {{%rd[0-9]+}}}
+; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, 2}
+define void @st_param_v2_i64(i64 %val) {
+  call void @call_v2_i64(%struct.longlong2 { i64 1, i64 2 })
+  %struct.ir0 = insertvalue %struct.longlong2 poison, i64 1, 0
+  %struct.ir1 = insertvalue %struct.longlong2 %struct.ir0, i64 %val, 1
+  call void @call_v2_i64(%struct.longlong2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.longlong2 poison, i64 %val, 0
+  %struct.ri1 = insertvalue %struct.longlong2 %struct.ri0, i64 2, 1
+  call void @call_v2_i64(%struct.longlong2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_f32
+; CHECK: st.param.v2.f32 [param0+0], {0f3F800000, 0f40000000}
+; CHECK: st.param.v2.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}}
+; CHECK: st.param.v2.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000}
+define void @st_param_v2_f32(float %val) {
+  call void @call_v2_f32(%struct.float2 { float 1.0, float 2.0 })
+  %struct.ir0 = insertvalue %struct.float2 poison, float 1.0, 0
+  %struct.ir1 = insertvalue %struct.float2 %struct.ir0, float %val, 1
+  call void @call_v2_f32(%struct.float2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.float2 poison, float %val, 0
+  %struct.ri1 = insertvalue %struct.float2 %struct.ri0, float 2.0, 1
+  call void @call_v2_f32(%struct.float2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_f64
+; CHECK: st.param.v2.f64 [param0+0], {0d3FF0000000000000, 0d4000000000000000}
+; CHECK: st.param.v2.f64 [param0+0], {0d3FF0000000000000, {{%fd[0-9]+}}}
+; CHECK: st.param.v2.f64 [param0+0], {{{%fd[0-9]+}}, 0d4000000000000000}
+define void @st_param_v2_f64(double %val) {
+  call void @call_v2_f64(%struct.double2 { double 1.0, double 2.0 })
+  %struct.ir0 = insertvalue %struct.double2 poison, double 1.0, 0
+  %struct.ir1 = insertvalue %struct.double2 %struct.ir0, double %val, 1
+  call void @call_v2_f64(%struct.double2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.double2 poison, double %val, 0
+  %struct.ri1 = insertvalue %struct.double2 %struct.ri0, double 2.0, 1
+  call void @call_v2_f64(%struct.double2 %struct.ri1)
+  ret void
+}
+
+declare void @call_v2_i8(%struct.char2)
+declare void @call_v2_i16(%struct.short2)
+declare void @call_v2_i32(%struct.int2)
+declare void @call_v2_i64(%struct.longlong2)
+declare void @call_v2_f32(%struct.float2)
+declare void @call_v2_f64(%struct.double2)
+
+; CHECK-LABEL: st_param_v4_i8
+; CHECK: st.param.v4.b8 [param0+0], {1, 2, 3, 4}
+; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b8 [param0+0], {1, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b8 [param0+0], {1, 2, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {1, 2, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, 3, 4}
+define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
+  call void @call_v4_i8(%struct.char4 { i8 1, i8 2, i8 3, i8 4 })
+
+  %struct.irrr0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irrr1 = insertvalue %struct.char4 %struct.irrr0, i8 %b, 1
+  %struct.irrr2 = insertvalue %struct.char4 %struct.irrr1, i8 %c, 2
+  %struct.irrr3 = insertvalue %struct.char4 %struct.irrr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.irrr3)
+
+  %struct.rirr0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rirr1 = insertvalue %struct.char4 %struct.rirr0, i8 2, 1
+  %struct.rirr2 = insertvalue %struct.char4 %struct.rirr1, i8 %c, 2
+  %struct.rirr3 = insertvalue %struct.char4 %struct.rirr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.rirr3)
+
+  %struct.rrir0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrir1 = insertvalue %struct.char4 %struct.rrir0, i8 %b, 1
+  %struct.rrir2 = insertvalue %struct.char4 %struct.rrir1, i8 3, 2
+  %struct.rrir3 = insertvalue %struct.char4 %struct.rrir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrir3)
+
+  %struct.rrri0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrri1 = insertvalue %struct.char4 %struct.rrri0, i8 %b, 1
+  %struct.rrri2 = insertvalue %struct.char4 %struct.rrri1, i8 %c, 2
+  %struct.rrri3 = insertvalue %struct.char4 %struct.rrri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrri3)
+
+  %struct.iirr0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iirr1 = insertvalue %struct.char4 %struct.iirr0, i8 2, 1
+  %struct.iirr2 = insertvalue %struct.char4 %struct.iirr1, i8 %c, 2
+  %struct.iirr3 = insertvalue %struct.char4 %struct.iirr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.iirr3)
+
+  %struct.irir0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irir1 = insertvalue %struct.char4 %struct.irir0, i8 %b, 1
+  %struct.irir2 = insertvalue %struct.char4 %struct.irir1, i8 3, 2
+  %struct.irir3 = insertvalue %struct.char4 %struct.irir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.irir3)
+
+  %struct.irri0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irri1 = insertvalue %struct.char4 %struct.irri0, i8 %b, 1
+  %struct.irri2 = insertvalue %struct.char4 %struct.irri1, i8 %c, 2
+  %struct.irri3 = insertvalue %struct.char4 %struct.irri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.irri3)
+
+  %struct.riir0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.riir1 = insertvalue %struct.char4 %struct.riir0, i8 2, 1
+  %struct.riir2 = insertvalue %struct.char4 %struct.riir1, i8 3, 2
+  %struct.riir3 = insertvalue %struct.char4 %struct.riir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.riir3)
+
+  %struct.riri0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.riri1 = insertvalue %struct.char4 %struct.riri0, i8 2, 1
+  %struct.riri2 = insertvalue %struct.char4 %struct.riri1, i8 %c, 2
+  %struct.riri3 = insertvalue %struct.char4 %struct.riri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.riri3)
+
+  %struct.rrii0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrii1 = insertvalue %struct.char4 %struct.rrii0, i8 %b, 1
+  %struct.rrii2 = insertvalue %struct.char4 %struct.rrii1, i8 3, 2
+  %struct.rrii3 = insertvalue %struct.char4 %struct.rrii2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrii3)
+
+  %struct.iiir0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iiir1 = insertvalue %struct.char4 %struct.iiir0, i8 2, 1
+  %struct.iiir2 = insertvalue %struct.char4 %struct.iiir1, i8 3, 2
+  %struct.iiir3 = insertvalue %struct.char4 %struct.iiir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.iiir3)
+
+  %struct.iiri0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iiri1 = insertvalue %struct.char4 %struct.iiri0, i8 2, 1
+  %struct.iiri2 = insertvalue %struct.char4 %struct.iiri1, i8 %c, 2
+  %struct.iiri3 = insertvalue %struct.char4 %struct.iiri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.iiri3)
+
+  %struct.irii0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irii1 = insertvalue %struct.char4 %struct.irii0, i8 %b, 1
+  %struct.irii2 = insertvalue %struct.char4 %struct.irii1, i8 3, 2
+  %struct.irii3 = insertvalue %struct.char4 %struct.irii2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.irii3)
+
+  %struct.riii0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.riii1 = insertvalue %struct.char4 %struct.riii0, i8 2, 1
+  %struct.riii2 = insertvalue %struct.char4 %struct.riii1, i8 3, 2
+  %struct.riii3 = insertvalue %struct.char4 %struct.riii2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.riii3)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v4_i16
+; CHECK: st.param.v4.b16 [param0+0], {1, 2, 3, 4}
+; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b16 [param0+0], {1, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b16 [param0+0], {1, 2, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {1, 2, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, 3, 4}
+define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
+  call void @call_v4_i16(%struct.short4 { i16 1, i16 2, i16 3, i16 4 })
+
+  %struct.irrr0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irrr1 = insertvalue %struct.short4 %struct.irrr0, i16 %b, 1
+  %struct.irrr2 = insertvalue %struct.short4 %struct.irrr1, i16 %c, 2
+  %struct.irrr3 = insertvalue %struct.short4 %struct.irrr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.irrr3)
+
+  %struct.rirr0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rirr1 = insertvalue %struct.short4 %struct.rirr0, i16 2, 1
+  %struct.rirr2 = insertvalue %struct.short4 %struct.rirr1, i16 %c, 2
+  %struct.rirr3 = insertvalue %struct.short4 %struct.rirr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.rirr3)
+
+  %struct.rrir0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrir1 = insertvalue %struct.short4 %struct.rrir0, i16 %b, 1
+  %struct.rrir2 = insertvalue %struct.short4 %struct.rrir1, i16 3, 2
+  %struct.rrir3 = insertvalue %struct.short4 %struct.rrir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrir3)
+
+  %struct.rrri0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrri1 = insertvalue %struct.short4 %struct.rrri0, i16 %b, 1
+  %struct.rrri2 = insertvalue %struct.short4 %struct.rrri1, i16 %c, 2
+  %struct.rrri3 = insertvalue %struct.short4 %struct.rrri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrri3)
+
+  %struct.iirr0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iirr1 = insertvalue %struct.short4 %struct.iirr0, i16 2, 1
+  %struct.iirr2 = insertvalue %struct.short4 %struct.iirr1, i16 %c, 2
+  %struct.iirr3 = insertvalue %struct.short4 %struct.iirr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.iirr3)
+
+  %struct.irir0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irir1 = insertvalue %struct.short4 %struct.irir0, i16 %b, 1
+  %struct.irir2 = insertvalue %struct.short4 %struct.irir1, i16 3, 2
+  %struct.irir3 = insertvalue %struct.short4 %struct.irir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.irir3)
+
+  %struct.irri0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irri1 = insertvalue %struct.short4 %struct.irri0, i16 %b, 1
+  %struct.irri2 = insertvalue %struct.short4 %struct.irri1, i16 %c, 2
+  %struct.irri3 = insertvalue %struct.short4 %struct.irri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.irri3)
+
+  %struct.riir0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.riir1 = insertvalue %struct.short4 %struct.riir0, i16 2, 1
+  %struct.riir2 = insertvalue %struct.short4 %struct.riir1, i16 3, 2
+  %struct.riir3 = insertvalue %struct.short4 %struct.riir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.riir3)
+
+  %struct.riri0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.riri1 = insertvalue %struct.short4 %struct.riri0, i16 2, 1
+  %struct.riri2 = insertvalue %struct.short4 %struct.riri1, i16 %c, 2
+  %struct.riri3 = insertvalue %struct.short4 %struct.riri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.riri3)
+
+  %struct.rrii0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrii1 = insertvalue %struct.short4 %struct.rrii0, i16 %b, 1
+  %struct.rrii2 = insertvalue %struct.short4 %struct.rrii1, i16 3, 2
+  %struct.rrii3 = insertvalue %struct.short4 %struct.rrii2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrii3)
+
+  %struct.iiir0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iiir1 = insertvalue %struct.short4 %struct.iiir0, i16 2, 1
+  %struct.iiir2 = insertvalue %struct.short4 %struct.iiir1, i16 3, 2
+  %struct.iiir3 = insertvalue %struct.short4 %struct.iiir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.iiir3)
+
+  %struct.iiri0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iiri1 = insertvalue %struct.short4 %struct.iiri0, i16 2, 1
+  %struct.iiri2 = insertvalue %struct.short4 %struct.iiri1, i16 %c, 2
+  %struct.iiri3 = insertvalue %struct.short4 %struct.iiri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.iiri3)
+
+  %struct.irii0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irii1 = insertvalue %struct.short4 %struct.irii0, i16 %b, 1
+  %struct.irii2 = insertvalue %struct.short4 %struct.irii1, i16 3, 2
+  %struct.irii3 = insertvalue %struct.short4 %struct.irii2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.irii3)
+
+  %struct.riii0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.riii1 = insertvalue %struct.short4 %struct.riii0, i16 2, 1
+  %struct.riii2 = insertvalue %struct.short4 %struct.riii1, i16 3, 2
+  %struct.riii3 = insertvalue %struct.short4 %struct.riii2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.riii3)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v4_i32
+; CHECK: st.param.v4.b32 [param0+0], {1, 2, 3, 4}
+; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, {{%r[0-9]+}}, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, 3, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, 4}
+; CHECK: st.param.v4.b32 [param0+0], {1, 2, {{%r[0-9]+}}, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, 3, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, {{%r[0-9]+}}, 4}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, 3, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, {{%r[0-9]+}}, 4}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b32 [param0+0], {1, 2, 3, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {1, 2, {{%r[0-9]+}}, 4}
+; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, 3, 4}
+define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
+  call void @call_v4_i32(%struct.int4 { i32 1, i32 2, i32 3, i32 4 })
+
+  %struct.irrr0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irrr1 = insertvalue %struct.int4 %struct.irrr0, i32 %b, 1
+  %struct.irrr2 = insertvalue %struct.int4 %struct.irrr1, i32 %c, 2
+  %struct.irrr3 = insertvalue %struct.int4 %struct.irrr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.irrr3)
+
+  %struct.rirr0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rirr1 = insertvalue %struct.int4 %struct.rirr0, i32 2, 1
+  %struct.rirr2 = insertvalue %struct.int4 %struct.rirr1, i32 %c, 2
+  %struct.rirr3 = insertvalue %struct.int4 %struct.rirr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.rirr3)
+
+  %struct.rrir0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrir1 = insertvalue %struct.int4 %struct.rrir0, i32 %b, 1
+  %struct.rrir2 = insertvalue %struct.int4 %struct.rrir1, i32 3, 2
+  %struct.rrir3 = insertvalue %struct.int4 %struct.rrir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrir3)
+
+  %struct.rrri0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrri1 = insertvalue %struct.int4 %struct.rrri0, i32 %b, 1
+  %struct.rrri2 = insertvalue %struct.int4 %struct.rrri1, i32 %c, 2
+  %struct.rrri3 = insertvalue %struct.int4 %struct.rrri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrri3)
+
+  %struct.iirr0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.iirr1 = insertvalue %struct.int4 %struct.iirr0, i32 2, 1
+  %struct.iirr2 = insertvalue %struct.int4 %struct.iirr1, i32 %c, 2
+  %struct.iirr3 = insertvalue %struct.int4 %struct.iirr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.iirr3)
+
+  %struct.irir0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irir1 = insertvalue %struct.int4 %struct.irir0, i32 %b, 1
+  %struct.irir2 = insertvalue %struct.int4 %struct.irir1, i32 3, 2
+  %struct.irir3 = insertvalue %struct.int4 %struct.irir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.irir3)
+
+  %struct.irri0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irri1 = insertvalue %struct.int4 %struct.irri0, i32 %b, 1
+  %struct.irri2 = insertvalue %struct.int4 %struct.irri1, i32 %c, 2
+  %struct.irri3 = insertvalue %struct.int4 %struct.irri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.irri3)
+
+  %struct.riir0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.riir1 = insertvalue %struct.int4 %struct.riir0, i32 2, 1
+  %struct.riir2 = insertvalue %struct.int4 %struct.riir1, i32 3, 2
+  %struct.riir3 = insertvalue %struct.int4 %struct.riir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.riir3)
+
+  %struct.riri0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.riri1 = insertvalue %struct.int4 %struct.riri0, i32 2, 1
+  %struct.riri2 = insertvalue %struct.int4 %struct.riri1, i32 %c, 2
+  %struct.riri3 = insertvalue %struct.int4 %struct.riri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.riri3)
+
+  %struct.rrii0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrii1 = insertvalue %struct.int4 %struct.rrii0, i32 %b, 1
+  %struct.rrii2 = insertvalue %struct.int4 %struct.rrii1, i32 3, 2
+  %struct.rrii3 = insertvalue %struct.int4 %struct.rrii2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrii3)
+
+  %struct.iiir0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.iiir1 = insertvalue %struct.int4 %struct.iiir0, i32 2, 1
+  %struct.iiir2 = insertvalue %struct.int4 %struct.iiir1, i32 3, 2
+  %struct.iiir3 = insertvalue %struct.int4 %struct.iiir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.iiir3)
+
+  %struct.iiri0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.iiri1 = insertvalue %struct.int4 %struct.iiri0, i32 2, 1
+  %struct.iiri2 = insertvalue %struct.int4 %struct.iiri1, i32 %c, 2
+  %struct.iiri3 = insertvalue %struct.int4 %struct.iiri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.iiri3)
+
+  %struct.irii0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irii1 = insertvalue %struct.int4 %struct.irii0, i32 %b, 1
+  %struct.irii2 = insertvalue %struct.int4 %struct.irii1, i32 3, 2
+  %struct.irii3 = insertvalue %struct.int4 %struct.irii2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.irii3)
+
+  %struct.riii0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.riii1 = insertvalue %struct.int4 %struct.riii0, i32 2, 1
+  %struct.riii2 = insertvalue %struct.int4 %struct.riii1, i32 3, 2
+  %struct.riii3 = insertvalue %struct.int4 %struct.riii2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.riii3)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v4_f32
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, {{%f[0-9]+}}, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, {{%f[0-9]+}}, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, 0f40400000, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, {{%f[0-9]+}}, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, {{%f[0-9]+}}, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, 0f40400000, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, {{%f[0-9]+}}, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, 0f40400000, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, {{%f[0-9]+}}, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, 0f40400000, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, {{%f[0-9]+}}, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, 0f40400000, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, 0f40400000, 0f40800000}
+define void @st_param_v4_f32(float %a, float %b, float %c, float %d) {
+  call void @call_v4_f32(%struct.float4 { float 1.0, float 2.0, float 3.0, float 4.0 })
+
+  %struct.irrr0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irrr1 = insertvalue %struct.float4 %struct.irrr0, float %b, 1
+  %struct.irrr2 = insertvalue %struct.float4 %struct.irrr1, float %c, 2
+  %struct.irrr3 = insertvalue %struct.float4 %struct.irrr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.irrr3)
+
+  %struct.rirr0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rirr1 = insertvalue %struct.float4 %struct.rirr0, float 2.0, 1
+  %struct.rirr2 = insertvalue %struct.float4 %struct.rirr1, float %c, 2
+  %struct.rirr3 = insertvalue %struct.float4 %struct.rirr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.rirr3)
+
+  %struct.rrir0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrir1 = insertvalue %struct.float4 %struct.rrir0, float %b, 1
+  %struct.rrir2 = insertvalue %struct.float4 %struct.rrir1, float 3.0, 2
+  %struct.rrir3 = insertvalue %struct.float4 %struct.rrir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrir3)
+
+  %struct.rrri0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrri1 = insertvalue %struct.float4 %struct.rrri0, float %b, 1
+  %struct.rrri2 = insertvalue %struct.float4 %struct.rrri1, float %c, 2
+  %struct.rrri3 = insertvalue %struct.float4 %struct.rrri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrri3)
+
+  %struct.iirr0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iirr1 = insertvalue %struct.float4 %struct.iirr0, float 2.0, 1
+  %struct.iirr2 = insertvalue %struct.float4 %struct.iirr1, float %c, 2
+  %struct.iirr3 = insertvalue %struct.float4 %struct.iirr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.iirr3)
+
+  %struct.irir0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irir1 = insertvalue %struct.float4 %struct.irir0, float %b, 1
+  %struct.irir2 = insertvalue %struct.float4 %struct.irir1, float 3.0, 2
+  %struct.irir3 = insertvalue %struct.float4 %struct.irir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.irir3)
+
+  %struct.irri0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irri1 = insertvalue %struct.float4 %struct.irri0, float %b, 1
+  %struct.irri2 = insertvalue %struct.float4 %struct.irri1, float %c, 2
+  %struct.irri3 = insertvalue %struct.float4 %struct.irri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.irri3)
+
+  %struct.riir0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.riir1 = insertvalue %struct.float4 %struct.riir0, float 2.0, 1
+  %struct.riir2 = insertvalue %struct.float4 %struct.riir1, float 3.0, 2
+  %struct.riir3 = insertvalue %struct.float4 %struct.riir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.riir3)
+
+  %struct.riri0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.riri1 = insertvalue %struct.float4 %struct.riri0, float 2.0, 1
+  %struct.riri2 = insertvalue %struct.float4 %struct.riri1, float %c, 2
+  %struct.riri3 = insertvalue %struct.float4 %struct.riri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.riri3)
+
+  %struct.rrii0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrii1 = insertvalue %struct.float4 %struct.rrii0, float %b, 1
+  %struct.rrii2 = insertvalue %struct.float4 %struct.rrii1, float 3.0, 2
+  %struct.rrii3 = insertvalue %struct.float4 %struct.rrii2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrii3)
+
+  %struct.iiir0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iiir1 = insertvalue %struct.float4 %struct.iiir0, float 2.0, 1
+  %struct.iiir2 = insertvalue %struct.float4 %struct.iiir1, float 3.0, 2
+  %struct.iiir3 = insertvalue %struct.float4 %struct.iiir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.iiir3)
+
+  %struct.iiri0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iiri1 = insertvalue %struct.float4 %struct.iiri0, float 2.0, 1
+  %struct.iiri2 = insertvalue %struct.float4 %struct.iiri1, float %c, 2
+  %struct.iiri3 = insertvalue %struct.float4 %struct.iiri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.iiri3)
+
+  %struct.irii0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irii1 = insertvalue %struct.float4 %struct.irii0, float %b, 1
+  %struct.irii2 = insertvalue %struct.float4 %struct.irii1, float 3.0, 2
+  %struct.irii3 = insertvalue %struct.float4 %struct.irii2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.irii3)
+
+  %struct.riii0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.riii1 = insertvalue %struct.float4 %struct.riii0, float 2.0, 1
+  %struct.riii2 = insertvalue %struct.float4 %struct.riii1, float 3.0, 2
+  %struct.riii3 = insertvalue %struct.float4 %struct.riii2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.riii3)
+  ret void
+}
+
+declare void @call_v4_i8(%struct.char4)
+declare void @call_v4_i16(%struct.short4)
+declare void @call_v4_i32(%struct.int4)
+declare void @call_v4_f32(%struct.float4)
+
+!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8, !9, !10}
+!1 = !{ptr @call_v2_i8, !"align", i32 65538}
+!2 = !{ptr @call_v2_i16, !"align", i32 65540}
+!3 = !{ptr @call_v2_i32, !"align", i32 65544}
+!4 = !{ptr @call_v2_i64, !"align", i32 65552}
+!5 = !{ptr @call_v2_f32, !"align", i32 65544}
+!6 = !{ptr @call_v2_f64, !"align", i32 65552}
+!7 = !{ptr @call_v4_i8, !"align", i32 65540}
+!8 = !{ptr @call_v4_i16, !"align", i32 65544}
+!9 = !{ptr @call_v4_i32, !"align", i32 65552}
+!10 = !{ptr @call_v4_f32, !"align", i32 65552}

>From f7405a63ef82009089ff15e97c05ea5b87edb1ed Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Thu, 9 May 2024 15:52:58 +0000
Subject: [PATCH 2/4] fixup and address comments

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp |  66 +-
 llvm/test/CodeGen/NVPTX/st-param-imm.ll     | 962 ++++++++++++++++++--
 2 files changed, 903 insertions(+), 125 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index fc9d760093d66..7e7d167d6a2c7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2183,25 +2183,29 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
 }
 
 // Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
-#define getOpcV2H(ty, op0, op1) NVPTX::StoreParamV2##ty##_##op0##op1
+#define getOpcV2H(ty, opKind0, opKind1)                                        \
+  NVPTX::StoreParamV2##ty##_##opKind0##opKind1
 
-#define getOpcV2H1(ty, op0, op1)                                               \
-  (op1) ? getOpcV2H(ty, op0, i) : getOpcV2H(ty, op0, r)
+#define getOpcV2H1(ty, opKind0, isImm1)                                        \
+  (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
 
 #define getOpcodeForVectorStParamV2(ty, isimm)                                 \
   (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
 
-#define getOpcV4H(ty, op0, op1, op2, op3)                                      \
-  NVPTX::StoreParamV4##ty##_##op0##op1##op2##op3
+#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3)                      \
+  NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
 
-#define getOpcV4H3(ty, op0, op1, op2, op3)                                     \
-  (op3) ? getOpcV4H(ty, op0, op1, op2, i) : getOpcV4H(ty, op0, op1, op2, r)
+#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3)                      \
+  (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i)                       \
+           : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
 
-#define getOpcV4H2(ty, op0, op1, op2, op3)                                     \
-  (op2) ? getOpcV4H3(ty, op0, op1, i, op3) : getOpcV4H3(ty, op0, op1, r, op3)
+#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3)                       \
+  (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3)                       \
+           : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
 
-#define getOpcV4H1(ty, op0, op1, op2, op3)                                     \
-  (op1) ? getOpcV4H2(ty, op0, i, op2, op3) : getOpcV4H2(ty, op0, r, op2, op3)
+#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3)                        \
+  (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3)                        \
+           : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
 
 #define getOpcodeForVectorStParamV4(ty, isimm)                                 \
   (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3])                 \
@@ -2211,10 +2215,10 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
   (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm)                            \
            : getOpcodeForVectorStParamV4(ty, isimm)
 
-static std::optional<unsigned>
-pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, unsigned NumElts,
-                           MVT::SimpleValueType MemTy, SelectionDAG *CurDAG,
-                           SDLoc DL) {
+static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
+                                           unsigned NumElts,
+                                           MVT::SimpleValueType MemTy,
+                                           SelectionDAG *CurDAG, SDLoc DL) {
   // Determine which inputs are registers and immediates make new operators
   // with constant values
   SmallVector<bool, 4> IsImm(NumElts, false);
@@ -2244,19 +2248,31 @@ pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, unsigned NumElts,
   case MVT::i32:
     return getOpcodeForVectorStParam(NumElts, I32, IsImm);
   case MVT::i64:
-    if (NumElts == 4)
-      return std::nullopt;
+    assert(NumElts == 2 && "MVT too large for NumElts > 2");
     return getOpcodeForVectorStParamV2(I64, IsImm);
   case MVT::f32:
     return getOpcodeForVectorStParam(NumElts, F32, IsImm);
   case MVT::f64:
-    if (NumElts == 4)
-      return std::nullopt;
+    assert(NumElts == 2 && "MVT too large for NumElts > 2");
     return getOpcodeForVectorStParamV2(F64, IsImm);
+
+  // These cases don't support immediates, just use the all register version
+  // and generate moves.
+  case MVT::i1:
+    return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
+                          : NVPTX::StoreParamV4I8_rrrr;
   case MVT::f16:
+  case MVT::bf16:
+    return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
+                          : NVPTX::StoreParamV4I16_rrrr;
   case MVT::v2f16:
+  case MVT::v2bf16:
+  case MVT::v2i16:
+  case MVT::v4i8:
+    return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
+                          : NVPTX::StoreParamV4I32_rrrr;
   default:
-    return std::nullopt;
+    llvm_unreachable("Cannot select st.param for unknown MemTy");
   }
 }
 
@@ -2271,10 +2287,10 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
   SDValue Glue = N->getOperand(N->getNumOperands() - 1);
 
   // How many elements do we have?
-  unsigned NumElts = 1;
+  unsigned NumElts;
   switch (N->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable("Unexpected opcode");
   case NVPTXISD::StoreParamU32:
   case NVPTXISD::StoreParamS32:
   case NVPTXISD::StoreParam:
@@ -2300,12 +2316,12 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
   // Determine target opcode
   // If we have an i1, use an 8-bit store. The lowering code in
   // NVPTXISelLowering will have already emitted an upcast.
-  std::optional<unsigned> Opcode = 0;
+  std::optional<unsigned> Opcode;
   switch (N->getOpcode()) {
   default:
     switch (NumElts) {
     default:
-      return false;
+      llvm_unreachable("Unexpected NumElts");
     case 1: {
       MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
       SDValue Imm = Ops[0];
@@ -2357,8 +2373,6 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
       break;
     }
     }
-    if (!Opcode)
-      return false;
     break;
   // Special case: if we have a sign-extend/zero-extend node, insert the
   // conversion instruction first, and use that as the value operand to
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
index 022035505e2a7..fd74218d3dd94 100644
--- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -1,8 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -march=nvptx64 | FileCheck %s
 ; RUN: llc < %s -march=nvptx | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -march=nvptx -verify-machineinstrs | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -verify-machineinstrs | %ptxas-verify %}
 
+target triple = "nvptx64-nvidia-cuda"
+
 %struct.A = type { i8, i16 }
 %struct.char2 = type { i8, i8 }
 %struct.char4 = type { i8, i8, i8, i8 }
@@ -15,38 +18,103 @@
 %struct.float4 = type { float, float, float, float }
 %struct.double2 = type { double, double }
 
-; CHECK-LABEL: st_param_i8_i16
-; CHECK: st.param.b8 [param0+0], 1
-; CHECK: st.param.b16 [param0+2], 2
 define void @st_param_i8_i16() {
+; CHECK-LABEL: st_param_i8_i16(
+; CHECK:       {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .align 2 .b8 param0[4];
+; CHECK-NEXT:    st.param.b8 [param0+0], 1;
+; CHECK-NEXT:    st.param.b16 [param0+2], 2;
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_i8_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    ret;
   call void @call_i8_i16(%struct.A { i8 1, i16 2 })
   ret void
 }
 
-; CHECK-LABEL: st_param_i32
-; CHECK: st.param.b32 [param0+0], 3
 define void @st_param_i32() {
+; CHECK-LABEL: st_param_i32(
+; CHECK:       {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    { // callseq 1, 0
+; CHECK-NEXT:    .param .b32 param0;
+; CHECK-NEXT:    st.param.b32 [param0+0], 3;
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 1
+; CHECK-NEXT:    ret;
   call void @call_i32(i32 3)
   ret void
 }
 
-; CHECK-LABEL: st_param_i64
-; CHECK: st.param.b64 [param0+0], 4
 define void @st_param_i64() {
+; CHECK-LABEL: st_param_i64(
+; CHECK:       {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    { // callseq 2, 0
+; CHECK-NEXT:    .param .b64 param0;
+; CHECK-NEXT:    st.param.b64 [param0+0], 4;
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_i64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 2
+; CHECK-NEXT:    ret;
   call void @call_i64(i64 4)
   ret void
 }
 
-; CHECK-LABEL: st_param_f32
-; CHECK: st.param.f32 [param0+0], 0f40A00000
 define void @st_param_f32() {
+; CHECK-LABEL: st_param_f32(
+; CHECK:       {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    { // callseq 3, 0
+; CHECK-NEXT:    .param .b32 param0;
+; CHECK-NEXT:    st.param.f32 [param0+0], 0f40A00000;
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 3
+; CHECK-NEXT:    ret;
   call void @call_f32(float 5.0)
   ret void
 }
 
-; CHECK-LABEL: st_param_f64
-; CHECK: st.param.f64 [param0+0], 0d4018000000000000
 define void @st_param_f64() {
+; CHECK-LABEL: st_param_f64(
+; CHECK:       {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    { // callseq 4, 0
+; CHECK-NEXT:    .param .b64 param0;
+; CHECK-NEXT:    st.param.f64 [param0+0], 0d4018000000000000;
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_f64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 4
+; CHECK-NEXT:    ret;
   call void @call_f64(double 6.0)
   ret void
 }
@@ -57,11 +125,41 @@ declare void @call_i64(i64)
 declare void @call_f32(float)
 declare void @call_f64(double)
 
-; CHECK-LABEL: st_param_v2_i8
-; CHECK: st.param.v2.b8 [param0+0], {1, 2}
-; CHECK: st.param.v2.b8 [param0+0], {1, {{%rs[0-9]+}}}
-; CHECK: st.param.v2.b8 [param0+0], {{{%rs[0-9]+}}, 2}
 define void @st_param_v2_i8(i8 %val) {
+; CHECK-LABEL: st_param_v2_i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v2_i8_param_0];
+; CHECK-NEXT:    { // callseq 5, 0
+; CHECK-NEXT:    .param .align 2 .b8 param0[2];
+; CHECK-NEXT:    st.param.v2.b8 [param0+0], {1, 2};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 5
+; CHECK-NEXT:    { // callseq 6, 0
+; CHECK-NEXT:    .param .align 2 .b8 param0[2];
+; CHECK-NEXT:    st.param.v2.b8 [param0+0], {1, %rs1};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 6
+; CHECK-NEXT:    { // callseq 7, 0
+; CHECK-NEXT:    .param .align 2 .b8 param0[2];
+; CHECK-NEXT:    st.param.v2.b8 [param0+0], {%rs1, 2};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 7
+; CHECK-NEXT:    ret;
   call void @call_v2_i8(%struct.char2 { i8 1, i8 2 })
   %struct.ir0 = insertvalue %struct.char2 poison, i8 1, 0
   %struct.ir1 = insertvalue %struct.char2 %struct.ir0, i8 %val, 1
@@ -72,11 +170,41 @@ define void @st_param_v2_i8(i8 %val) {
   ret void
 }
 
-; CHECK-LABEL: st_param_v2_i16
-; CHECK: st.param.v2.b16 [param0+0], {1, 2}
-; CHECK: st.param.v2.b16 [param0+0], {1, {{%rs[0-9]+}}}
-; CHECK: st.param.v2.b16 [param0+0], {{{%rs[0-9]+}}, 2}
 define void @st_param_v2_i16(i16 %val) {
+; CHECK-LABEL: st_param_v2_i16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v2_i16_param_0];
+; CHECK-NEXT:    { // callseq 8, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v2.b16 [param0+0], {1, 2};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 8
+; CHECK-NEXT:    { // callseq 9, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v2.b16 [param0+0], {1, %rs1};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 9
+; CHECK-NEXT:    { // callseq 10, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v2.b16 [param0+0], {%rs1, 2};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 10
+; CHECK-NEXT:    ret;
   call void @call_v2_i16(%struct.short2 { i16 1, i16 2 })
   %struct.ir0 = insertvalue %struct.short2 poison, i16 1, 0
   %struct.ir1 = insertvalue %struct.short2 %struct.ir0, i16 %val, 1
@@ -87,11 +215,41 @@ define void @st_param_v2_i16(i16 %val) {
   ret void
 }
 
-; CHECK-LABEL: st_param_v2_i32
-; CHECK: st.param.v2.b32 [param0+0], {1, 2}
-; CHECK: st.param.v2.b32 [param0+0], {1, {{%r[0-9]+}}}
-; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, 2}
 define void @st_param_v2_i32(i32 %val) {
+; CHECK-LABEL: st_param_v2_i32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v2_i32_param_0];
+; CHECK-NEXT:    { // callseq 11, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v2.b32 [param0+0], {1, 2};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 11
+; CHECK-NEXT:    { // callseq 12, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v2.b32 [param0+0], {1, %r1};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 12
+; CHECK-NEXT:    { // callseq 13, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v2.b32 [param0+0], {%r1, 2};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 13
+; CHECK-NEXT:    ret;
   call void @call_v2_i32(%struct.int2 { i32 1, i32 2 })
   %struct.ir0 = insertvalue %struct.int2 poison, i32 1, 0
   %struct.ir1 = insertvalue %struct.int2 %struct.ir0, i32 %val, 1
@@ -102,11 +260,41 @@ define void @st_param_v2_i32(i32 %val) {
   ret void
 }
 
-; CHECK-LABEL: st_param_v2_i64
-; CHECK: st.param.v2.b64 [param0+0], {1, 2}
-; CHECK: st.param.v2.b64 [param0+0], {1, {{%rd[0-9]+}}}
-; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, 2}
 define void @st_param_v2_i64(i64 %val) {
+; CHECK-LABEL: st_param_v2_i64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [st_param_v2_i64_param_0];
+; CHECK-NEXT:    { // callseq 14, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v2.b64 [param0+0], {1, 2};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 14
+; CHECK-NEXT:    { // callseq 15, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v2.b64 [param0+0], {1, %rd1};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 15
+; CHECK-NEXT:    { // callseq 16, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v2.b64 [param0+0], {%rd1, 2};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_i64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 16
+; CHECK-NEXT:    ret;
   call void @call_v2_i64(%struct.longlong2 { i64 1, i64 2 })
   %struct.ir0 = insertvalue %struct.longlong2 poison, i64 1, 0
   %struct.ir1 = insertvalue %struct.longlong2 %struct.ir0, i64 %val, 1
@@ -117,11 +305,41 @@ define void @st_param_v2_i64(i64 %val) {
   ret void
 }
 
-; CHECK-LABEL: st_param_v2_f32
-; CHECK: st.param.v2.f32 [param0+0], {0f3F800000, 0f40000000}
-; CHECK: st.param.v2.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}}
-; CHECK: st.param.v2.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000}
 define void @st_param_v2_f32(float %val) {
+; CHECK-LABEL: st_param_v2_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v2_f32_param_0];
+; CHECK-NEXT:    { // callseq 17, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v2.f32 [param0+0], {0f3F800000, 0f40000000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 17
+; CHECK-NEXT:    { // callseq 18, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v2.f32 [param0+0], {0f3F800000, %f1};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 18
+; CHECK-NEXT:    { // callseq 19, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v2.f32 [param0+0], {%f1, 0f40000000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 19
+; CHECK-NEXT:    ret;
   call void @call_v2_f32(%struct.float2 { float 1.0, float 2.0 })
   %struct.ir0 = insertvalue %struct.float2 poison, float 1.0, 0
   %struct.ir1 = insertvalue %struct.float2 %struct.ir0, float %val, 1
@@ -132,11 +350,41 @@ define void @st_param_v2_f32(float %val) {
   ret void
 }
 
-; CHECK-LABEL: st_param_v2_f64
-; CHECK: st.param.v2.f64 [param0+0], {0d3FF0000000000000, 0d4000000000000000}
-; CHECK: st.param.v2.f64 [param0+0], {0d3FF0000000000000, {{%fd[0-9]+}}}
-; CHECK: st.param.v2.f64 [param0+0], {{{%fd[0-9]+}}, 0d4000000000000000}
 define void @st_param_v2_f64(double %val) {
+; CHECK-LABEL: st_param_v2_f64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [st_param_v2_f64_param_0];
+; CHECK-NEXT:    { // callseq 20, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v2.f64 [param0+0], {0d3FF0000000000000, 0d4000000000000000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_f64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 20
+; CHECK-NEXT:    { // callseq 21, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v2.f64 [param0+0], {0d3FF0000000000000, %fd1};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_f64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 21
+; CHECK-NEXT:    { // callseq 22, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v2.f64 [param0+0], {%fd1, 0d4000000000000000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v2_f64,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 22
+; CHECK-NEXT:    ret;
   call void @call_v2_f64(%struct.double2 { double 1.0, double 2.0 })
   %struct.ir0 = insertvalue %struct.double2 poison, double 1.0, 0
   %struct.ir1 = insertvalue %struct.double2 %struct.ir0, double %val, 1
@@ -154,23 +402,152 @@ declare void @call_v2_i64(%struct.longlong2)
 declare void @call_v2_f32(%struct.float2)
 declare void @call_v2_f64(%struct.double2)
 
-; CHECK-LABEL: st_param_v4_i8
-; CHECK: st.param.v4.b8 [param0+0], {1, 2, 3, 4}
-; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
-; CHECK: st.param.v4.b8 [param0+0], {1, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
-; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, 3, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, 4}
-; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, 4}
-; CHECK: st.param.v4.b8 [param0+0], {1, 2, 3, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b8 [param0+0], {1, 2, {{%rs[0-9]+}}, 4}
-; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, 3, 4}
-; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, 3, 4}
 define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
+; CHECK-LABEL: st_param_v4_i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_param_0];
+; CHECK-NEXT:    { // callseq 23, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 23
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_param_1];
+; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_param_2];
+; CHECK-NEXT:    ld.param.u8 %rs4, [st_param_v4_i8_param_3];
+; CHECK-NEXT:    { // callseq 24, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs2, %rs3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 24
+; CHECK-NEXT:    { // callseq 25, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, %rs3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 25
+; CHECK-NEXT:    { // callseq 26, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, %rs2, 3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 26
+; CHECK-NEXT:    { // callseq 27, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, %rs2, %rs3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 27
+; CHECK-NEXT:    { // callseq 28, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, %rs3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 28
+; CHECK-NEXT:    { // callseq 29, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs2, 3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 29
+; CHECK-NEXT:    { // callseq 30, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs2, %rs3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 30
+; CHECK-NEXT:    { // callseq 31, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, 3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 31
+; CHECK-NEXT:    { // callseq 32, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, %rs3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 32
+; CHECK-NEXT:    { // callseq 33, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, %rs2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 33
+; CHECK-NEXT:    { // callseq 34, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, 3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 34
+; CHECK-NEXT:    { // callseq 35, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, %rs3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 35
+; CHECK-NEXT:    { // callseq 36, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 36
+; CHECK-NEXT:    { // callseq 37, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 37
+; CHECK-NEXT:    ret;
   call void @call_v4_i8(%struct.char4 { i8 1, i8 2, i8 3, i8 4 })
 
   %struct.irrr0 = insertvalue %struct.char4 poison, i8 1, 0
@@ -259,23 +636,152 @@ define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
   ret void
 }
 
-; CHECK-LABEL: st_param_v4_i16
-; CHECK: st.param.v4.b16 [param0+0], {1, 2, 3, 4}
-; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
-; CHECK: st.param.v4.b16 [param0+0], {1, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
-; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, 3, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, 4}
-; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, 4}
-; CHECK: st.param.v4.b16 [param0+0], {1, 2, 3, {{%rs[0-9]+}}}
-; CHECK: st.param.v4.b16 [param0+0], {1, 2, {{%rs[0-9]+}}, 4}
-; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, 3, 4}
-; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, 3, 4}
 define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
+; CHECK-LABEL: st_param_v4_i16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_param_0];
+; CHECK-NEXT:    { // callseq 38, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 38
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_param_1];
+; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_param_2];
+; CHECK-NEXT:    ld.param.u16 %rs4, [st_param_v4_i16_param_3];
+; CHECK-NEXT:    { // callseq 39, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs2, %rs3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 39
+; CHECK-NEXT:    { // callseq 40, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, %rs3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 40
+; CHECK-NEXT:    { // callseq 41, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, %rs2, 3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 41
+; CHECK-NEXT:    { // callseq 42, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, %rs2, %rs3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 42
+; CHECK-NEXT:    { // callseq 43, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, %rs3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 43
+; CHECK-NEXT:    { // callseq 44, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs2, 3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 44
+; CHECK-NEXT:    { // callseq 45, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs2, %rs3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 45
+; CHECK-NEXT:    { // callseq 46, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, 3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 46
+; CHECK-NEXT:    { // callseq 47, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, %rs3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 47
+; CHECK-NEXT:    { // callseq 48, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, %rs2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 48
+; CHECK-NEXT:    { // callseq 49, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, 3, %rs4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 49
+; CHECK-NEXT:    { // callseq 50, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, %rs3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 50
+; CHECK-NEXT:    { // callseq 51, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 51
+; CHECK-NEXT:    { // callseq 52, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 52
+; CHECK-NEXT:    ret;
   call void @call_v4_i16(%struct.short4 { i16 1, i16 2, i16 3, i16 4 })
 
   %struct.irrr0 = insertvalue %struct.short4 poison, i16 1, 0
@@ -364,23 +870,152 @@ define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
   ret void
 }
 
-; CHECK-LABEL: st_param_v4_i32
-; CHECK: st.param.v4.b32 [param0+0], {1, 2, 3, 4}
-; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}
-; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, {{%r[0-9]+}}, {{%r[0-9]+}}}
-; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, 3, {{%r[0-9]+}}}
-; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, 4}
-; CHECK: st.param.v4.b32 [param0+0], {1, 2, {{%r[0-9]+}}, {{%r[0-9]+}}}
-; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, 3, {{%r[0-9]+}}}
-; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, {{%r[0-9]+}}, 4}
-; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, 3, {{%r[0-9]+}}}
-; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, {{%r[0-9]+}}, 4}
-; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, 3, 4}
-; CHECK: st.param.v4.b32 [param0+0], {1, 2, 3, {{%r[0-9]+}}}
-; CHECK: st.param.v4.b32 [param0+0], {1, 2, {{%r[0-9]+}}, 4}
-; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, 3, 4}
-; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, 3, 4}
 define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: st_param_v4_i32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_param_0];
+; CHECK-NEXT:    { // callseq 53, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 53
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_param_2];
+; CHECK-NEXT:    ld.param.u32 %r4, [st_param_v4_i32_param_3];
+; CHECK-NEXT:    { // callseq 54, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r2, %r3, %r4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 54
+; CHECK-NEXT:    { // callseq 55, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, %r3, %r4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 55
+; CHECK-NEXT:    { // callseq 56, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, %r2, 3, %r4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 56
+; CHECK-NEXT:    { // callseq 57, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, %r2, %r3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 57
+; CHECK-NEXT:    { // callseq 58, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, %r3, %r4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 58
+; CHECK-NEXT:    { // callseq 59, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r2, 3, %r4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 59
+; CHECK-NEXT:    { // callseq 60, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r2, %r3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 60
+; CHECK-NEXT:    { // callseq 61, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, 3, %r4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 61
+; CHECK-NEXT:    { // callseq 62, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, %r3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 62
+; CHECK-NEXT:    { // callseq 63, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, %r2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 63
+; CHECK-NEXT:    { // callseq 64, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, 3, %r4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 64
+; CHECK-NEXT:    { // callseq 65, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, %r3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 65
+; CHECK-NEXT:    { // callseq 66, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 66
+; CHECK-NEXT:    { // callseq 67, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 67
+; CHECK-NEXT:    ret;
   call void @call_v4_i32(%struct.int4 { i32 1, i32 2, i32 3, i32 4 })
 
   %struct.irrr0 = insertvalue %struct.int4 poison, i32 1, 0
@@ -469,23 +1104,152 @@ define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
   ret void
 }
 
-; CHECK-LABEL: st_param_v4_f32
-; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000}
-; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, {{%f[0-9]+}}, {{%f[0-9]+}}}
-; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, {{%f[0-9]+}}, {{%f[0-9]+}}}
-; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, 0f40400000, {{%f[0-9]+}}}
-; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, {{%f[0-9]+}}, 0f40800000}
-; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, {{%f[0-9]+}}, {{%f[0-9]+}}}
-; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, 0f40400000, {{%f[0-9]+}}}
-; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, {{%f[0-9]+}}, 0f40800000}
-; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, 0f40400000, {{%f[0-9]+}}}
-; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, {{%f[0-9]+}}, 0f40800000}
-; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, 0f40400000, 0f40800000}
-; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, {{%f[0-9]+}}}
-; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, {{%f[0-9]+}}, 0f40800000}
-; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, 0f40400000, 0f40800000}
-; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, 0f40400000, 0f40800000}
 define void @st_param_v4_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: st_param_v4_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_param_0];
+; CHECK-NEXT:    { // callseq 68, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 68
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_param_1];
+; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_param_2];
+; CHECK-NEXT:    ld.param.f32 %f4, [st_param_v4_f32_param_3];
+; CHECK-NEXT:    { // callseq 69, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f2, %f3, %f4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 69
+; CHECK-NEXT:    { // callseq 70, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, %f3, %f4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 70
+; CHECK-NEXT:    { // callseq 71, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, %f2, 0f40400000, %f4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 71
+; CHECK-NEXT:    { // callseq 72, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, %f2, %f3, 0f40800000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 72
+; CHECK-NEXT:    { // callseq 73, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, %f3, %f4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 73
+; CHECK-NEXT:    { // callseq 74, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f2, 0f40400000, %f4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 74
+; CHECK-NEXT:    { // callseq 75, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f2, %f3, 0f40800000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 75
+; CHECK-NEXT:    { // callseq 76, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, 0f40400000, %f4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 76
+; CHECK-NEXT:    { // callseq 77, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, %f3, 0f40800000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 77
+; CHECK-NEXT:    { // callseq 78, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, %f2, 0f40400000, 0f40800000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 78
+; CHECK-NEXT:    { // callseq 79, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, %f4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 79
+; CHECK-NEXT:    { // callseq 80, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, %f3, 0f40800000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 80
+; CHECK-NEXT:    { // callseq 81, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f2, 0f40400000, 0f40800000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 81
+; CHECK-NEXT:    { // callseq 82, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, 0f40400000, 0f40800000};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_f32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 82
+; CHECK-NEXT:    ret;
   call void @call_v4_f32(%struct.float4 { float 1.0, float 2.0, float 3.0, float 4.0 })
 
   %struct.irrr0 = insertvalue %struct.float4 poison, float 1.0, 0

>From ebf9db249b860381421112946d430c341cf4c48d Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Tue, 14 May 2024 00:08:46 +0000
Subject: [PATCH 3/4] address comments

---
 llvm/test/CodeGen/NVPTX/st-param-imm.ll | 1570 ++++++++++++++++-------
 1 file changed, 1114 insertions(+), 456 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
index fd74218d3dd94..bb6b04bc2d523 100644
--- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -125,13 +125,12 @@ declare void @call_i64(i64)
 declare void @call_f32(float)
 declare void @call_f64(double)
 
-define void @st_param_v2_i8(i8 %val) {
-; CHECK-LABEL: st_param_v2_i8(
+define void @st_param_v2_i8_ii() {
+; CHECK-LABEL: st_param_v2_i8_ii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v2_i8_param_0];
 ; CHECK-NEXT:    { // callseq 5, 0
 ; CHECK-NEXT:    .param .align 2 .b8 param0[2];
 ; CHECK-NEXT:    st.param.v2.b8 [param0+0], {1, 2};
@@ -141,6 +140,17 @@ define void @st_param_v2_i8(i8 %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 5
+; CHECK-NEXT:    ret;
+  call void @call_v2_i8(%struct.char2 { i8 1, i8 2 })
+  ret void
+}
+define void @st_param_v2_i8_ir(i8 %val) {
+; CHECK-LABEL: st_param_v2_i8_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v2_i8_ir_param_0];
 ; CHECK-NEXT:    { // callseq 6, 0
 ; CHECK-NEXT:    .param .align 2 .b8 param0[2];
 ; CHECK-NEXT:    st.param.v2.b8 [param0+0], {1, %rs1};
@@ -150,6 +160,19 @@ define void @st_param_v2_i8(i8 %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 6
+; CHECK-NEXT:    ret;
+  %struct.ir0 = insertvalue %struct.char2 poison, i8 1, 0
+  %struct.ir1 = insertvalue %struct.char2 %struct.ir0, i8 %val, 1
+  call void @call_v2_i8(%struct.char2 %struct.ir1)
+  ret void
+}
+define void @st_param_v2_i8_ri(i8 %val) {
+; CHECK-LABEL: st_param_v2_i8_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v2_i8_ri_param_0];
 ; CHECK-NEXT:    { // callseq 7, 0
 ; CHECK-NEXT:    .param .align 2 .b8 param0[2];
 ; CHECK-NEXT:    st.param.v2.b8 [param0+0], {%rs1, 2};
@@ -160,23 +183,18 @@ define void @st_param_v2_i8(i8 %val) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 7
 ; CHECK-NEXT:    ret;
-  call void @call_v2_i8(%struct.char2 { i8 1, i8 2 })
-  %struct.ir0 = insertvalue %struct.char2 poison, i8 1, 0
-  %struct.ir1 = insertvalue %struct.char2 %struct.ir0, i8 %val, 1
-  call void @call_v2_i8(%struct.char2 %struct.ir1)
   %struct.ri0 = insertvalue %struct.char2 poison, i8 %val, 0
   %struct.ri1 = insertvalue %struct.char2 %struct.ri0, i8 2, 1
   call void @call_v2_i8(%struct.char2 %struct.ri1)
   ret void
 }
 
-define void @st_param_v2_i16(i16 %val) {
-; CHECK-LABEL: st_param_v2_i16(
+define void @st_param_v2_i16_ii() {
+; CHECK-LABEL: st_param_v2_i16_ii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v2_i16_param_0];
 ; CHECK-NEXT:    { // callseq 8, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v2.b16 [param0+0], {1, 2};
@@ -186,6 +204,17 @@ define void @st_param_v2_i16(i16 %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 8
+; CHECK-NEXT:    ret;
+  call void @call_v2_i16(%struct.short2 { i16 1, i16 2 })
+  ret void
+}
+define void @st_param_v2_i16_ir(i16 %val) {
+; CHECK-LABEL: st_param_v2_i16_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v2_i16_ir_param_0];
 ; CHECK-NEXT:    { // callseq 9, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v2.b16 [param0+0], {1, %rs1};
@@ -195,6 +224,19 @@ define void @st_param_v2_i16(i16 %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 9
+; CHECK-NEXT:    ret;
+  %struct.ir0 = insertvalue %struct.short2 poison, i16 1, 0
+  %struct.ir1 = insertvalue %struct.short2 %struct.ir0, i16 %val, 1
+  call void @call_v2_i16(%struct.short2 %struct.ir1)
+  ret void
+}
+define void @st_param_v2_i16_ri(i16 %val) {
+; CHECK-LABEL: st_param_v2_i16_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v2_i16_ri_param_0];
 ; CHECK-NEXT:    { // callseq 10, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v2.b16 [param0+0], {%rs1, 2};
@@ -205,23 +247,18 @@ define void @st_param_v2_i16(i16 %val) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 10
 ; CHECK-NEXT:    ret;
-  call void @call_v2_i16(%struct.short2 { i16 1, i16 2 })
-  %struct.ir0 = insertvalue %struct.short2 poison, i16 1, 0
-  %struct.ir1 = insertvalue %struct.short2 %struct.ir0, i16 %val, 1
-  call void @call_v2_i16(%struct.short2 %struct.ir1)
   %struct.ri0 = insertvalue %struct.short2 poison, i16 %val, 0
   %struct.ri1 = insertvalue %struct.short2 %struct.ri0, i16 2, 1
   call void @call_v2_i16(%struct.short2 %struct.ri1)
   ret void
 }
 
-define void @st_param_v2_i32(i32 %val) {
-; CHECK-LABEL: st_param_v2_i32(
+define void @st_param_v2_i32_ii() {
+; CHECK-LABEL: st_param_v2_i32_ii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v2_i32_param_0];
 ; CHECK-NEXT:    { // callseq 11, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v2.b32 [param0+0], {1, 2};
@@ -231,6 +268,17 @@ define void @st_param_v2_i32(i32 %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 11
+; CHECK-NEXT:    ret;
+  call void @call_v2_i32(%struct.int2 { i32 1, i32 2 })
+  ret void
+}
+define void @st_param_v2_i32_ir(i32 %val) {
+; CHECK-LABEL: st_param_v2_i32_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v2_i32_ir_param_0];
 ; CHECK-NEXT:    { // callseq 12, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v2.b32 [param0+0], {1, %r1};
@@ -240,6 +288,19 @@ define void @st_param_v2_i32(i32 %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 12
+; CHECK-NEXT:    ret;
+  %struct.ir0 = insertvalue %struct.int2 poison, i32 1, 0
+  %struct.ir1 = insertvalue %struct.int2 %struct.ir0, i32 %val, 1
+  call void @call_v2_i32(%struct.int2 %struct.ir1)
+  ret void
+}
+define void @st_param_v2_i32_ri(i32 %val) {
+; CHECK-LABEL: st_param_v2_i32_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v2_i32_ri_param_0];
 ; CHECK-NEXT:    { // callseq 13, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v2.b32 [param0+0], {%r1, 2};
@@ -250,23 +311,18 @@ define void @st_param_v2_i32(i32 %val) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 13
 ; CHECK-NEXT:    ret;
-  call void @call_v2_i32(%struct.int2 { i32 1, i32 2 })
-  %struct.ir0 = insertvalue %struct.int2 poison, i32 1, 0
-  %struct.ir1 = insertvalue %struct.int2 %struct.ir0, i32 %val, 1
-  call void @call_v2_i32(%struct.int2 %struct.ir1)
   %struct.ri0 = insertvalue %struct.int2 poison, i32 %val, 0
   %struct.ri1 = insertvalue %struct.int2 %struct.ri0, i32 2, 1
   call void @call_v2_i32(%struct.int2 %struct.ri1)
   ret void
 }
 
-define void @st_param_v2_i64(i64 %val) {
-; CHECK-LABEL: st_param_v2_i64(
+define void @st_param_v2_i64_ii() {
+; CHECK-LABEL: st_param_v2_i64_ii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [st_param_v2_i64_param_0];
 ; CHECK-NEXT:    { // callseq 14, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.b64 [param0+0], {1, 2};
@@ -276,6 +332,17 @@ define void @st_param_v2_i64(i64 %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 14
+; CHECK-NEXT:    ret;
+  call void @call_v2_i64(%struct.longlong2 { i64 1, i64 2 })
+  ret void
+}
+define void @st_param_v2_i64_ir(i64 %val) {
+; CHECK-LABEL: st_param_v2_i64_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [st_param_v2_i64_ir_param_0];
 ; CHECK-NEXT:    { // callseq 15, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.b64 [param0+0], {1, %rd1};
@@ -285,6 +352,19 @@ define void @st_param_v2_i64(i64 %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 15
+; CHECK-NEXT:    ret;
+  %struct.ir0 = insertvalue %struct.longlong2 poison, i64 1, 0
+  %struct.ir1 = insertvalue %struct.longlong2 %struct.ir0, i64 %val, 1
+  call void @call_v2_i64(%struct.longlong2 %struct.ir1)
+  ret void
+}
+define void @st_param_v2_i64_ri(i64 %val) {
+; CHECK-LABEL: st_param_v2_i64_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [st_param_v2_i64_ri_param_0];
 ; CHECK-NEXT:    { // callseq 16, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.b64 [param0+0], {%rd1, 2};
@@ -295,23 +375,18 @@ define void @st_param_v2_i64(i64 %val) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 16
 ; CHECK-NEXT:    ret;
-  call void @call_v2_i64(%struct.longlong2 { i64 1, i64 2 })
-  %struct.ir0 = insertvalue %struct.longlong2 poison, i64 1, 0
-  %struct.ir1 = insertvalue %struct.longlong2 %struct.ir0, i64 %val, 1
-  call void @call_v2_i64(%struct.longlong2 %struct.ir1)
   %struct.ri0 = insertvalue %struct.longlong2 poison, i64 %val, 0
   %struct.ri1 = insertvalue %struct.longlong2 %struct.ri0, i64 2, 1
   call void @call_v2_i64(%struct.longlong2 %struct.ri1)
   ret void
 }
 
-define void @st_param_v2_f32(float %val) {
-; CHECK-LABEL: st_param_v2_f32(
+define void @st_param_v2_f32_ii(float %val) {
+; CHECK-LABEL: st_param_v2_f32_ii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v2_f32_param_0];
 ; CHECK-NEXT:    { // callseq 17, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v2.f32 [param0+0], {0f3F800000, 0f40000000};
@@ -321,6 +396,17 @@ define void @st_param_v2_f32(float %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 17
+; CHECK-NEXT:    ret;
+  call void @call_v2_f32(%struct.float2 { float 1.0, float 2.0 })
+  ret void
+}
+define void @st_param_v2_f32_ir(float %val) {
+; CHECK-LABEL: st_param_v2_f32_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v2_f32_ir_param_0];
 ; CHECK-NEXT:    { // callseq 18, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v2.f32 [param0+0], {0f3F800000, %f1};
@@ -330,6 +416,19 @@ define void @st_param_v2_f32(float %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 18
+; CHECK-NEXT:    ret;
+  %struct.ir0 = insertvalue %struct.float2 poison, float 1.0, 0
+  %struct.ir1 = insertvalue %struct.float2 %struct.ir0, float %val, 1
+  call void @call_v2_f32(%struct.float2 %struct.ir1)
+  ret void
+}
+define void @st_param_v2_f32_ri(float %val) {
+; CHECK-LABEL: st_param_v2_f32_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v2_f32_ri_param_0];
 ; CHECK-NEXT:    { // callseq 19, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v2.f32 [param0+0], {%f1, 0f40000000};
@@ -340,23 +439,18 @@ define void @st_param_v2_f32(float %val) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 19
 ; CHECK-NEXT:    ret;
-  call void @call_v2_f32(%struct.float2 { float 1.0, float 2.0 })
-  %struct.ir0 = insertvalue %struct.float2 poison, float 1.0, 0
-  %struct.ir1 = insertvalue %struct.float2 %struct.ir0, float %val, 1
-  call void @call_v2_f32(%struct.float2 %struct.ir1)
   %struct.ri0 = insertvalue %struct.float2 poison, float %val, 0
   %struct.ri1 = insertvalue %struct.float2 %struct.ri0, float 2.0, 1
   call void @call_v2_f32(%struct.float2 %struct.ri1)
   ret void
 }
 
-define void @st_param_v2_f64(double %val) {
-; CHECK-LABEL: st_param_v2_f64(
+define void @st_param_v2_f64_ii(double %val) {
+; CHECK-LABEL: st_param_v2_f64_ii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .f64 %fd<2>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [st_param_v2_f64_param_0];
 ; CHECK-NEXT:    { // callseq 20, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.f64 [param0+0], {0d3FF0000000000000, 0d4000000000000000};
@@ -366,6 +460,17 @@ define void @st_param_v2_f64(double %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 20
+; CHECK-NEXT:    ret;
+  call void @call_v2_f64(%struct.double2 { double 1.0, double 2.0 })
+  ret void
+}
+define void @st_param_v2_f64_ir(double %val) {
+; CHECK-LABEL: st_param_v2_f64_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [st_param_v2_f64_ir_param_0];
 ; CHECK-NEXT:    { // callseq 21, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.f64 [param0+0], {0d3FF0000000000000, %fd1};
@@ -375,6 +480,19 @@ define void @st_param_v2_f64(double %val) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 21
+; CHECK-NEXT:    ret;
+  %struct.ir0 = insertvalue %struct.double2 poison, double 1.0, 0
+  %struct.ir1 = insertvalue %struct.double2 %struct.ir0, double %val, 1
+  call void @call_v2_f64(%struct.double2 %struct.ir1)
+  ret void
+}
+define void @st_param_v2_f64_ri(double %val) {
+; CHECK-LABEL: st_param_v2_f64_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f64 %fd1, [st_param_v2_f64_ri_param_0];
 ; CHECK-NEXT:    { // callseq 22, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.f64 [param0+0], {%fd1, 0d4000000000000000};
@@ -385,10 +503,6 @@ define void @st_param_v2_f64(double %val) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 22
 ; CHECK-NEXT:    ret;
-  call void @call_v2_f64(%struct.double2 { double 1.0, double 2.0 })
-  %struct.ir0 = insertvalue %struct.double2 poison, double 1.0, 0
-  %struct.ir1 = insertvalue %struct.double2 %struct.ir0, double %val, 1
-  call void @call_v2_f64(%struct.double2 %struct.ir1)
   %struct.ri0 = insertvalue %struct.double2 poison, double %val, 0
   %struct.ri1 = insertvalue %struct.double2 %struct.ri0, double 2.0, 1
   call void @call_v2_f64(%struct.double2 %struct.ri1)
@@ -402,13 +516,12 @@ declare void @call_v2_i64(%struct.longlong2)
 declare void @call_v2_f32(%struct.float2)
 declare void @call_v2_f64(%struct.double2)
 
-define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
-; CHECK-LABEL: st_param_v4_i8(
+define void @st_param_v4_i8_iiii() {
+; CHECK-LABEL: st_param_v4_i8_iiii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_param_0];
 ; CHECK-NEXT:    { // callseq 23, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, 3, 4};
@@ -418,36 +531,97 @@ define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 23
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_param_1];
-; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_param_2];
-; CHECK-NEXT:    ld.param.u8 %rs4, [st_param_v4_i8_param_3];
+; CHECK-NEXT:    ret;
+  call void @call_v4_i8(%struct.char4 { i8 1, i8 2, i8 3, i8 4 })
+  ret void
+}
+define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
+; CHECK-LABEL: st_param_v4_i8_irrr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_irrr_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_irrr_param_1];
+; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 24, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs2, %rs3, %rs4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs1, %rs2, %rs3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 24
+; CHECK-NEXT:    ret;
+  %struct.irrr0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irrr1 = insertvalue %struct.char4 %struct.irrr0, i8 %b, 1
+  %struct.irrr2 = insertvalue %struct.char4 %struct.irrr1, i8 %c, 2
+  %struct.irrr3 = insertvalue %struct.char4 %struct.irrr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.irrr3)
+  ret void
+}
+define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
+; CHECK-LABEL: st_param_v4_i8_rirr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_rirr_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_rirr_param_1];
+; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 25, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, %rs3, %rs4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, %rs2, %rs3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 25
+; CHECK-NEXT:    ret;
+  %struct.rirr0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rirr1 = insertvalue %struct.char4 %struct.rirr0, i8 2, 1
+  %struct.rirr2 = insertvalue %struct.char4 %struct.rirr1, i8 %c, 2
+  %struct.rirr3 = insertvalue %struct.char4 %struct.rirr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.rirr3)
+  ret void
+}
+define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
+; CHECK-LABEL: st_param_v4_i8_rrir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_rrir_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_rrir_param_1];
+; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 26, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, %rs2, 3, %rs4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, %rs2, 3, %rs3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 26
+; CHECK-NEXT:    ret;
+  %struct.rrir0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrir1 = insertvalue %struct.char4 %struct.rrir0, i8 %b, 1
+  %struct.rrir2 = insertvalue %struct.char4 %struct.rrir1, i8 3, 2
+  %struct.rrir3 = insertvalue %struct.char4 %struct.rrir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrir3)
+  ret void
+}
+define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: st_param_v4_i8_rrri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_rrri_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_rrri_param_1];
+; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 27, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, %rs2, %rs3, 4};
@@ -457,51 +631,147 @@ define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 27
+; CHECK-NEXT:    ret;
+  %struct.rrri0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrri1 = insertvalue %struct.char4 %struct.rrri0, i8 %b, 1
+  %struct.rrri2 = insertvalue %struct.char4 %struct.rrri1, i8 %c, 2
+  %struct.rrri3 = insertvalue %struct.char4 %struct.rrri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrri3)
+  ret void
+}
+define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
+; CHECK-LABEL: st_param_v4_i8_iirr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_iirr_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 28, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, %rs3, %rs4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, %rs1, %rs2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 28
+; CHECK-NEXT:    ret;
+  %struct.iirr0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iirr1 = insertvalue %struct.char4 %struct.iirr0, i8 2, 1
+  %struct.iirr2 = insertvalue %struct.char4 %struct.iirr1, i8 %c, 2
+  %struct.iirr3 = insertvalue %struct.char4 %struct.iirr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.iirr3)
+  ret void
+}
+define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
+; CHECK-LABEL: st_param_v4_i8_irir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_irir_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_irir_param_1];
 ; CHECK-NEXT:    { // callseq 29, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs2, 3, %rs4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs1, 3, %rs2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 29
+; CHECK-NEXT:    ret;
+  %struct.irir0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irir1 = insertvalue %struct.char4 %struct.irir0, i8 %b, 1
+  %struct.irir2 = insertvalue %struct.char4 %struct.irir1, i8 3, 2
+  %struct.irir3 = insertvalue %struct.char4 %struct.irir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.irir3)
+  ret void
+}
+define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
+; CHECK-LABEL: st_param_v4_i8_irri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_irri_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_irri_param_1];
 ; CHECK-NEXT:    { // callseq 30, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs2, %rs3, 4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs1, %rs2, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 30
+; CHECK-NEXT:    ret;
+  %struct.irri0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irri1 = insertvalue %struct.char4 %struct.irri0, i8 %b, 1
+  %struct.irri2 = insertvalue %struct.char4 %struct.irri1, i8 %c, 2
+  %struct.irri3 = insertvalue %struct.char4 %struct.irri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.irri3)
+  ret void
+}
+define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
+; CHECK-LABEL: st_param_v4_i8_riir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_riir_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_riir_param_1];
 ; CHECK-NEXT:    { // callseq 31, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, 3, %rs4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, 3, %rs2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 31
+; CHECK-NEXT:    ret;
+  %struct.riir0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.riir1 = insertvalue %struct.char4 %struct.riir0, i8 2, 1
+  %struct.riir2 = insertvalue %struct.char4 %struct.riir1, i8 3, 2
+  %struct.riir3 = insertvalue %struct.char4 %struct.riir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.riir3)
+  ret void
+}
+define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
+; CHECK-LABEL: st_param_v4_i8_riri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_riri_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_riri_param_1];
 ; CHECK-NEXT:    { // callseq 32, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, %rs3, 4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, %rs2, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 32
+; CHECK-NEXT:    ret;
+  %struct.riri0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.riri1 = insertvalue %struct.char4 %struct.riri0, i8 2, 1
+  %struct.riri2 = insertvalue %struct.char4 %struct.riri1, i8 %c, 2
+  %struct.riri3 = insertvalue %struct.char4 %struct.riri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.riri3)
+  ret void
+}
+define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
+; CHECK-LABEL: st_param_v4_i8_rrii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_rrii_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 33, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, %rs2, 3, 4};
@@ -511,33 +781,93 @@ define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 33
-; CHECK-NEXT:    { // callseq 34, 0
-; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, 3, %rs4};
-; CHECK-NEXT:    call.uni
-; CHECK-NEXT:    call_v4_i8,
-; CHECK-NEXT:    (
+; CHECK-NEXT:    ret;
+  %struct.rrii0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrii1 = insertvalue %struct.char4 %struct.rrii0, i8 %b, 1
+  %struct.rrii2 = insertvalue %struct.char4 %struct.rrii1, i8 3, 2
+  %struct.rrii3 = insertvalue %struct.char4 %struct.rrii2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrii3)
+  ret void
+}
+define void @st_param_v4_i8_iiir(i8 %d) {
+; CHECK-LABEL: st_param_v4_i8_iiir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_iiir_param_0];
+; CHECK-NEXT:    { // callseq 34, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, 3, %rs1};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i8,
+; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 34
+; CHECK-NEXT:    ret;
+  %struct.iiir0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iiir1 = insertvalue %struct.char4 %struct.iiir0, i8 2, 1
+  %struct.iiir2 = insertvalue %struct.char4 %struct.iiir1, i8 3, 2
+  %struct.iiir3 = insertvalue %struct.char4 %struct.iiir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.iiir3)
+  ret void
+}
+define void @st_param_v4_i8_iiri(i8 %c) {
+; CHECK-LABEL: st_param_v4_i8_iiri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_iiri_param_0];
 ; CHECK-NEXT:    { // callseq 35, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, %rs3, 4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, 2, %rs1, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 35
+; CHECK-NEXT:    ret;
+  %struct.iiri0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iiri1 = insertvalue %struct.char4 %struct.iiri0, i8 2, 1
+  %struct.iiri2 = insertvalue %struct.char4 %struct.iiri1, i8 %c, 2
+  %struct.iiri3 = insertvalue %struct.char4 %struct.iiri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.iiri3)
+  ret void
+}
+define void @st_param_v4_i8_irii(i8 %b) {
+; CHECK-LABEL: st_param_v4_i8_irii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_irii_param_0];
 ; CHECK-NEXT:    { // callseq 36, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs2, 3, 4};
+; CHECK-NEXT:    st.param.v4.b8 [param0+0], {1, %rs1, 3, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i8,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 36
+; CHECK-NEXT:    ret;
+  %struct.irii0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irii1 = insertvalue %struct.char4 %struct.irii0, i8 %b, 1
+  %struct.irii2 = insertvalue %struct.char4 %struct.irii1, i8 3, 2
+  %struct.irii3 = insertvalue %struct.char4 %struct.irii2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.irii3)
+  ret void
+}
+define void @st_param_v4_i8_riii(i8 %a) {
+; CHECK-LABEL: st_param_v4_i8_riii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_riii_param_0];
 ; CHECK-NEXT:    { // callseq 37, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0+0], {%rs1, 2, 3, 4};
@@ -548,86 +878,6 @@ define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 37
 ; CHECK-NEXT:    ret;
-  call void @call_v4_i8(%struct.char4 { i8 1, i8 2, i8 3, i8 4 })
-
-  %struct.irrr0 = insertvalue %struct.char4 poison, i8 1, 0
-  %struct.irrr1 = insertvalue %struct.char4 %struct.irrr0, i8 %b, 1
-  %struct.irrr2 = insertvalue %struct.char4 %struct.irrr1, i8 %c, 2
-  %struct.irrr3 = insertvalue %struct.char4 %struct.irrr2, i8 %d, 3
-  call void @call_v4_i8(%struct.char4 %struct.irrr3)
-
-  %struct.rirr0 = insertvalue %struct.char4 poison, i8 %a, 0
-  %struct.rirr1 = insertvalue %struct.char4 %struct.rirr0, i8 2, 1
-  %struct.rirr2 = insertvalue %struct.char4 %struct.rirr1, i8 %c, 2
-  %struct.rirr3 = insertvalue %struct.char4 %struct.rirr2, i8 %d, 3
-  call void @call_v4_i8(%struct.char4 %struct.rirr3)
-
-  %struct.rrir0 = insertvalue %struct.char4 poison, i8 %a, 0
-  %struct.rrir1 = insertvalue %struct.char4 %struct.rrir0, i8 %b, 1
-  %struct.rrir2 = insertvalue %struct.char4 %struct.rrir1, i8 3, 2
-  %struct.rrir3 = insertvalue %struct.char4 %struct.rrir2, i8 %d, 3
-  call void @call_v4_i8(%struct.char4 %struct.rrir3)
-
-  %struct.rrri0 = insertvalue %struct.char4 poison, i8 %a, 0
-  %struct.rrri1 = insertvalue %struct.char4 %struct.rrri0, i8 %b, 1
-  %struct.rrri2 = insertvalue %struct.char4 %struct.rrri1, i8 %c, 2
-  %struct.rrri3 = insertvalue %struct.char4 %struct.rrri2, i8 4, 3
-  call void @call_v4_i8(%struct.char4 %struct.rrri3)
-
-  %struct.iirr0 = insertvalue %struct.char4 poison, i8 1, 0
-  %struct.iirr1 = insertvalue %struct.char4 %struct.iirr0, i8 2, 1
-  %struct.iirr2 = insertvalue %struct.char4 %struct.iirr1, i8 %c, 2
-  %struct.iirr3 = insertvalue %struct.char4 %struct.iirr2, i8 %d, 3
-  call void @call_v4_i8(%struct.char4 %struct.iirr3)
-
-  %struct.irir0 = insertvalue %struct.char4 poison, i8 1, 0
-  %struct.irir1 = insertvalue %struct.char4 %struct.irir0, i8 %b, 1
-  %struct.irir2 = insertvalue %struct.char4 %struct.irir1, i8 3, 2
-  %struct.irir3 = insertvalue %struct.char4 %struct.irir2, i8 %d, 3
-  call void @call_v4_i8(%struct.char4 %struct.irir3)
-
-  %struct.irri0 = insertvalue %struct.char4 poison, i8 1, 0
-  %struct.irri1 = insertvalue %struct.char4 %struct.irri0, i8 %b, 1
-  %struct.irri2 = insertvalue %struct.char4 %struct.irri1, i8 %c, 2
-  %struct.irri3 = insertvalue %struct.char4 %struct.irri2, i8 4, 3
-  call void @call_v4_i8(%struct.char4 %struct.irri3)
-
-  %struct.riir0 = insertvalue %struct.char4 poison, i8 %a, 0
-  %struct.riir1 = insertvalue %struct.char4 %struct.riir0, i8 2, 1
-  %struct.riir2 = insertvalue %struct.char4 %struct.riir1, i8 3, 2
-  %struct.riir3 = insertvalue %struct.char4 %struct.riir2, i8 %d, 3
-  call void @call_v4_i8(%struct.char4 %struct.riir3)
-
-  %struct.riri0 = insertvalue %struct.char4 poison, i8 %a, 0
-  %struct.riri1 = insertvalue %struct.char4 %struct.riri0, i8 2, 1
-  %struct.riri2 = insertvalue %struct.char4 %struct.riri1, i8 %c, 2
-  %struct.riri3 = insertvalue %struct.char4 %struct.riri2, i8 4, 3
-  call void @call_v4_i8(%struct.char4 %struct.riri3)
-
-  %struct.rrii0 = insertvalue %struct.char4 poison, i8 %a, 0
-  %struct.rrii1 = insertvalue %struct.char4 %struct.rrii0, i8 %b, 1
-  %struct.rrii2 = insertvalue %struct.char4 %struct.rrii1, i8 3, 2
-  %struct.rrii3 = insertvalue %struct.char4 %struct.rrii2, i8 4, 3
-  call void @call_v4_i8(%struct.char4 %struct.rrii3)
-
-  %struct.iiir0 = insertvalue %struct.char4 poison, i8 1, 0
-  %struct.iiir1 = insertvalue %struct.char4 %struct.iiir0, i8 2, 1
-  %struct.iiir2 = insertvalue %struct.char4 %struct.iiir1, i8 3, 2
-  %struct.iiir3 = insertvalue %struct.char4 %struct.iiir2, i8 %d, 3
-  call void @call_v4_i8(%struct.char4 %struct.iiir3)
-
-  %struct.iiri0 = insertvalue %struct.char4 poison, i8 1, 0
-  %struct.iiri1 = insertvalue %struct.char4 %struct.iiri0, i8 2, 1
-  %struct.iiri2 = insertvalue %struct.char4 %struct.iiri1, i8 %c, 2
-  %struct.iiri3 = insertvalue %struct.char4 %struct.iiri2, i8 4, 3
-  call void @call_v4_i8(%struct.char4 %struct.iiri3)
-
-  %struct.irii0 = insertvalue %struct.char4 poison, i8 1, 0
-  %struct.irii1 = insertvalue %struct.char4 %struct.irii0, i8 %b, 1
-  %struct.irii2 = insertvalue %struct.char4 %struct.irii1, i8 3, 2
-  %struct.irii3 = insertvalue %struct.char4 %struct.irii2, i8 4, 3
-  call void @call_v4_i8(%struct.char4 %struct.irii3)
-
   %struct.riii0 = insertvalue %struct.char4 poison, i8 %a, 0
   %struct.riii1 = insertvalue %struct.char4 %struct.riii0, i8 2, 1
   %struct.riii2 = insertvalue %struct.char4 %struct.riii1, i8 3, 2
@@ -636,13 +886,12 @@ define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
   ret void
 }
 
-define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
-; CHECK-LABEL: st_param_v4_i16(
+define void @st_param_v4_i16_iiii() {
+; CHECK-LABEL: st_param_v4_i16_iiii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_param_0];
 ; CHECK-NEXT:    { // callseq 38, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, 3, 4};
@@ -652,36 +901,97 @@ define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 38
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_param_1];
-; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_param_2];
-; CHECK-NEXT:    ld.param.u16 %rs4, [st_param_v4_i16_param_3];
+; CHECK-NEXT:    ret;
+  call void @call_v4_i16(%struct.short4 { i16 1, i16 2, i16 3, i16 4 })
+  ret void
+}
+define void @st_param_v4_i16_irrr(i16 %b, i16 %c, i16 %d) {
+; CHECK-LABEL: st_param_v4_i16_irrr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_irrr_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_irrr_param_1];
+; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 39, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs2, %rs3, %rs4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs1, %rs2, %rs3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 39
+; CHECK-NEXT:    ret;
+  %struct.irrr0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irrr1 = insertvalue %struct.short4 %struct.irrr0, i16 %b, 1
+  %struct.irrr2 = insertvalue %struct.short4 %struct.irrr1, i16 %c, 2
+  %struct.irrr3 = insertvalue %struct.short4 %struct.irrr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.irrr3)
+  ret void
+}
+define void @st_param_v4_i16_rirr(i16 %a, i16 %c, i16 %d) {
+; CHECK-LABEL: st_param_v4_i16_rirr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_rirr_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_rirr_param_1];
+; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 40, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, %rs3, %rs4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, %rs2, %rs3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 40
+; CHECK-NEXT:    ret;
+  %struct.rirr0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rirr1 = insertvalue %struct.short4 %struct.rirr0, i16 2, 1
+  %struct.rirr2 = insertvalue %struct.short4 %struct.rirr1, i16 %c, 2
+  %struct.rirr3 = insertvalue %struct.short4 %struct.rirr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.rirr3)
+  ret void
+}
+define void @st_param_v4_i16_rrir(i16 %a, i16 %b, i16 %d) {
+; CHECK-LABEL: st_param_v4_i16_rrir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_rrir_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_rrir_param_1];
+; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 41, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, %rs2, 3, %rs4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, %rs2, 3, %rs3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 41
+; CHECK-NEXT:    ret;
+  %struct.rrir0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrir1 = insertvalue %struct.short4 %struct.rrir0, i16 %b, 1
+  %struct.rrir2 = insertvalue %struct.short4 %struct.rrir1, i16 3, 2
+  %struct.rrir3 = insertvalue %struct.short4 %struct.rrir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrir3)
+  ret void
+}
+define void @st_param_v4_i16_rrri(i16 %a, i16 %b, i16 %c) {
+; CHECK-LABEL: st_param_v4_i16_rrri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_rrri_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_rrri_param_1];
+; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 42, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, %rs2, %rs3, 4};
@@ -691,51 +1001,147 @@ define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 42
+; CHECK-NEXT:    ret;
+  %struct.rrri0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrri1 = insertvalue %struct.short4 %struct.rrri0, i16 %b, 1
+  %struct.rrri2 = insertvalue %struct.short4 %struct.rrri1, i16 %c, 2
+  %struct.rrri3 = insertvalue %struct.short4 %struct.rrri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrri3)
+  ret void
+}
+define void @st_param_v4_i16_iirr(i16 %c, i16 %d) {
+; CHECK-LABEL: st_param_v4_i16_iirr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_iirr_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 43, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, %rs3, %rs4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, %rs1, %rs2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 43
+; CHECK-NEXT:    ret;
+  %struct.iirr0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iirr1 = insertvalue %struct.short4 %struct.iirr0, i16 2, 1
+  %struct.iirr2 = insertvalue %struct.short4 %struct.iirr1, i16 %c, 2
+  %struct.iirr3 = insertvalue %struct.short4 %struct.iirr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.iirr3)
+  ret void
+}
+define void @st_param_v4_i16_irir(i16 %b, i16 %d) {
+; CHECK-LABEL: st_param_v4_i16_irir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_irir_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_irir_param_1];
 ; CHECK-NEXT:    { // callseq 44, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs2, 3, %rs4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs1, 3, %rs2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 44
+; CHECK-NEXT:    ret;
+  %struct.irir0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irir1 = insertvalue %struct.short4 %struct.irir0, i16 %b, 1
+  %struct.irir2 = insertvalue %struct.short4 %struct.irir1, i16 3, 2
+  %struct.irir3 = insertvalue %struct.short4 %struct.irir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.irir3)
+  ret void
+}
+define void @st_param_v4_i16_irri(i16 %b, i16 %c) {
+; CHECK-LABEL: st_param_v4_i16_irri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_irri_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_irri_param_1];
 ; CHECK-NEXT:    { // callseq 45, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs2, %rs3, 4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs1, %rs2, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 45
+; CHECK-NEXT:    ret;
+  %struct.irri0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irri1 = insertvalue %struct.short4 %struct.irri0, i16 %b, 1
+  %struct.irri2 = insertvalue %struct.short4 %struct.irri1, i16 %c, 2
+  %struct.irri3 = insertvalue %struct.short4 %struct.irri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.irri3)
+  ret void
+}
+define void @st_param_v4_i16_riir(i16 %a, i16 %d) {
+; CHECK-LABEL: st_param_v4_i16_riir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_riir_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_riir_param_1];
 ; CHECK-NEXT:    { // callseq 46, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, 3, %rs4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, 3, %rs2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 46
+; CHECK-NEXT:    ret;
+  %struct.riir0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.riir1 = insertvalue %struct.short4 %struct.riir0, i16 2, 1
+  %struct.riir2 = insertvalue %struct.short4 %struct.riir1, i16 3, 2
+  %struct.riir3 = insertvalue %struct.short4 %struct.riir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.riir3)
+  ret void
+}
+define void @st_param_v4_i16_riri(i16 %a, i16 %c) {
+; CHECK-LABEL: st_param_v4_i16_riri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_riri_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_riri_param_1];
 ; CHECK-NEXT:    { // callseq 47, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, %rs3, 4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, %rs2, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 47
+; CHECK-NEXT:    ret;
+  %struct.riri0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.riri1 = insertvalue %struct.short4 %struct.riri0, i16 2, 1
+  %struct.riri2 = insertvalue %struct.short4 %struct.riri1, i16 %c, 2
+  %struct.riri3 = insertvalue %struct.short4 %struct.riri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.riri3)
+  ret void
+}
+define void @st_param_v4_i16_rrii(i16 %a, i16 %b) {
+; CHECK-LABEL: st_param_v4_i16_rrii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_rrii_param_0];
+; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 48, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, %rs2, 3, 4};
@@ -745,33 +1151,93 @@ define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 48
+; CHECK-NEXT:    ret;
+  %struct.rrii0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrii1 = insertvalue %struct.short4 %struct.rrii0, i16 %b, 1
+  %struct.rrii2 = insertvalue %struct.short4 %struct.rrii1, i16 3, 2
+  %struct.rrii3 = insertvalue %struct.short4 %struct.rrii2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrii3)
+  ret void
+}
+define void @st_param_v4_i16_iiir(i16 %d) {
+; CHECK-LABEL: st_param_v4_i16_iiir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_iiir_param_0];
 ; CHECK-NEXT:    { // callseq 49, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, 3, %rs4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, 3, %rs1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 49
-; CHECK-NEXT:    { // callseq 50, 0
-; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, %rs3, 4};
-; CHECK-NEXT:    call.uni
-; CHECK-NEXT:    call_v4_i16,
+; CHECK-NEXT:    ret;
+  %struct.iiir0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iiir1 = insertvalue %struct.short4 %struct.iiir0, i16 2, 1
+  %struct.iiir2 = insertvalue %struct.short4 %struct.iiir1, i16 3, 2
+  %struct.iiir3 = insertvalue %struct.short4 %struct.iiir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.iiir3)
+  ret void
+}
+define void @st_param_v4_i16_iiri(i16 %c) {
+; CHECK-LABEL: st_param_v4_i16_iiri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_iiri_param_0];
+; CHECK-NEXT:    { // callseq 50, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, 2, %rs1, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 50
+; CHECK-NEXT:    ret;
+  %struct.iiri0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iiri1 = insertvalue %struct.short4 %struct.iiri0, i16 2, 1
+  %struct.iiri2 = insertvalue %struct.short4 %struct.iiri1, i16 %c, 2
+  %struct.iiri3 = insertvalue %struct.short4 %struct.iiri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.iiri3)
+  ret void
+}
+define void @st_param_v4_i16_irii(i16 %b) {
+; CHECK-LABEL: st_param_v4_i16_irii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_irii_param_0];
 ; CHECK-NEXT:    { // callseq 51, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs2, 3, 4};
+; CHECK-NEXT:    st.param.v4.b16 [param0+0], {1, %rs1, 3, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i16,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 51
+; CHECK-NEXT:    ret;
+  %struct.irii0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irii1 = insertvalue %struct.short4 %struct.irii0, i16 %b, 1
+  %struct.irii2 = insertvalue %struct.short4 %struct.irii1, i16 3, 2
+  %struct.irii3 = insertvalue %struct.short4 %struct.irii2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.irii3)
+  ret void
+}
+define void @st_param_v4_i16_riii(i16 %a) {
+; CHECK-LABEL: st_param_v4_i16_riii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_riii_param_0];
 ; CHECK-NEXT:    { // callseq 52, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0+0], {%rs1, 2, 3, 4};
@@ -782,86 +1248,6 @@ define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 52
 ; CHECK-NEXT:    ret;
-  call void @call_v4_i16(%struct.short4 { i16 1, i16 2, i16 3, i16 4 })
-
-  %struct.irrr0 = insertvalue %struct.short4 poison, i16 1, 0
-  %struct.irrr1 = insertvalue %struct.short4 %struct.irrr0, i16 %b, 1
-  %struct.irrr2 = insertvalue %struct.short4 %struct.irrr1, i16 %c, 2
-  %struct.irrr3 = insertvalue %struct.short4 %struct.irrr2, i16 %d, 3
-  call void @call_v4_i16(%struct.short4 %struct.irrr3)
-
-  %struct.rirr0 = insertvalue %struct.short4 poison, i16 %a, 0
-  %struct.rirr1 = insertvalue %struct.short4 %struct.rirr0, i16 2, 1
-  %struct.rirr2 = insertvalue %struct.short4 %struct.rirr1, i16 %c, 2
-  %struct.rirr3 = insertvalue %struct.short4 %struct.rirr2, i16 %d, 3
-  call void @call_v4_i16(%struct.short4 %struct.rirr3)
-
-  %struct.rrir0 = insertvalue %struct.short4 poison, i16 %a, 0
-  %struct.rrir1 = insertvalue %struct.short4 %struct.rrir0, i16 %b, 1
-  %struct.rrir2 = insertvalue %struct.short4 %struct.rrir1, i16 3, 2
-  %struct.rrir3 = insertvalue %struct.short4 %struct.rrir2, i16 %d, 3
-  call void @call_v4_i16(%struct.short4 %struct.rrir3)
-
-  %struct.rrri0 = insertvalue %struct.short4 poison, i16 %a, 0
-  %struct.rrri1 = insertvalue %struct.short4 %struct.rrri0, i16 %b, 1
-  %struct.rrri2 = insertvalue %struct.short4 %struct.rrri1, i16 %c, 2
-  %struct.rrri3 = insertvalue %struct.short4 %struct.rrri2, i16 4, 3
-  call void @call_v4_i16(%struct.short4 %struct.rrri3)
-
-  %struct.iirr0 = insertvalue %struct.short4 poison, i16 1, 0
-  %struct.iirr1 = insertvalue %struct.short4 %struct.iirr0, i16 2, 1
-  %struct.iirr2 = insertvalue %struct.short4 %struct.iirr1, i16 %c, 2
-  %struct.iirr3 = insertvalue %struct.short4 %struct.iirr2, i16 %d, 3
-  call void @call_v4_i16(%struct.short4 %struct.iirr3)
-
-  %struct.irir0 = insertvalue %struct.short4 poison, i16 1, 0
-  %struct.irir1 = insertvalue %struct.short4 %struct.irir0, i16 %b, 1
-  %struct.irir2 = insertvalue %struct.short4 %struct.irir1, i16 3, 2
-  %struct.irir3 = insertvalue %struct.short4 %struct.irir2, i16 %d, 3
-  call void @call_v4_i16(%struct.short4 %struct.irir3)
-
-  %struct.irri0 = insertvalue %struct.short4 poison, i16 1, 0
-  %struct.irri1 = insertvalue %struct.short4 %struct.irri0, i16 %b, 1
-  %struct.irri2 = insertvalue %struct.short4 %struct.irri1, i16 %c, 2
-  %struct.irri3 = insertvalue %struct.short4 %struct.irri2, i16 4, 3
-  call void @call_v4_i16(%struct.short4 %struct.irri3)
-
-  %struct.riir0 = insertvalue %struct.short4 poison, i16 %a, 0
-  %struct.riir1 = insertvalue %struct.short4 %struct.riir0, i16 2, 1
-  %struct.riir2 = insertvalue %struct.short4 %struct.riir1, i16 3, 2
-  %struct.riir3 = insertvalue %struct.short4 %struct.riir2, i16 %d, 3
-  call void @call_v4_i16(%struct.short4 %struct.riir3)
-
-  %struct.riri0 = insertvalue %struct.short4 poison, i16 %a, 0
-  %struct.riri1 = insertvalue %struct.short4 %struct.riri0, i16 2, 1
-  %struct.riri2 = insertvalue %struct.short4 %struct.riri1, i16 %c, 2
-  %struct.riri3 = insertvalue %struct.short4 %struct.riri2, i16 4, 3
-  call void @call_v4_i16(%struct.short4 %struct.riri3)
-
-  %struct.rrii0 = insertvalue %struct.short4 poison, i16 %a, 0
-  %struct.rrii1 = insertvalue %struct.short4 %struct.rrii0, i16 %b, 1
-  %struct.rrii2 = insertvalue %struct.short4 %struct.rrii1, i16 3, 2
-  %struct.rrii3 = insertvalue %struct.short4 %struct.rrii2, i16 4, 3
-  call void @call_v4_i16(%struct.short4 %struct.rrii3)
-
-  %struct.iiir0 = insertvalue %struct.short4 poison, i16 1, 0
-  %struct.iiir1 = insertvalue %struct.short4 %struct.iiir0, i16 2, 1
-  %struct.iiir2 = insertvalue %struct.short4 %struct.iiir1, i16 3, 2
-  %struct.iiir3 = insertvalue %struct.short4 %struct.iiir2, i16 %d, 3
-  call void @call_v4_i16(%struct.short4 %struct.iiir3)
-
-  %struct.iiri0 = insertvalue %struct.short4 poison, i16 1, 0
-  %struct.iiri1 = insertvalue %struct.short4 %struct.iiri0, i16 2, 1
-  %struct.iiri2 = insertvalue %struct.short4 %struct.iiri1, i16 %c, 2
-  %struct.iiri3 = insertvalue %struct.short4 %struct.iiri2, i16 4, 3
-  call void @call_v4_i16(%struct.short4 %struct.iiri3)
-
-  %struct.irii0 = insertvalue %struct.short4 poison, i16 1, 0
-  %struct.irii1 = insertvalue %struct.short4 %struct.irii0, i16 %b, 1
-  %struct.irii2 = insertvalue %struct.short4 %struct.irii1, i16 3, 2
-  %struct.irii3 = insertvalue %struct.short4 %struct.irii2, i16 4, 3
-  call void @call_v4_i16(%struct.short4 %struct.irii3)
-
   %struct.riii0 = insertvalue %struct.short4 poison, i16 %a, 0
   %struct.riii1 = insertvalue %struct.short4 %struct.riii0, i16 2, 1
   %struct.riii2 = insertvalue %struct.short4 %struct.riii1, i16 3, 2
@@ -870,13 +1256,12 @@ define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
   ret void
 }
 
-define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
-; CHECK-LABEL: st_param_v4_i32(
+define void @st_param_v4_i32_iiii() {
+; CHECK-LABEL: st_param_v4_i32_iiii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_param_0];
 ; CHECK-NEXT:    { // callseq 53, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, 3, 4};
@@ -886,36 +1271,97 @@ define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 53
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_param_2];
-; CHECK-NEXT:    ld.param.u32 %r4, [st_param_v4_i32_param_3];
+; CHECK-NEXT:    ret;
+  call void @call_v4_i32(%struct.int4 { i32 1, i32 2, i32 3, i32 4 })
+  ret void
+}
+define void @st_param_v4_i32_irrr(i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: st_param_v4_i32_irrr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_irrr_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_irrr_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 54, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r2, %r3, %r4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r1, %r2, %r3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 54
+; CHECK-NEXT:    ret;
+  %struct.irrr0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irrr1 = insertvalue %struct.int4 %struct.irrr0, i32 %b, 1
+  %struct.irrr2 = insertvalue %struct.int4 %struct.irrr1, i32 %c, 2
+  %struct.irrr3 = insertvalue %struct.int4 %struct.irrr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.irrr3)
+  ret void
+}
+define void @st_param_v4_i32_rirr(i32 %a, i32 %c, i32 %d) {
+; CHECK-LABEL: st_param_v4_i32_rirr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_rirr_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_rirr_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 55, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, %r3, %r4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, %r2, %r3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 55
+; CHECK-NEXT:    ret;
+  %struct.rirr0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rirr1 = insertvalue %struct.int4 %struct.rirr0, i32 2, 1
+  %struct.rirr2 = insertvalue %struct.int4 %struct.rirr1, i32 %c, 2
+  %struct.rirr3 = insertvalue %struct.int4 %struct.rirr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.rirr3)
+  ret void
+}
+define void @st_param_v4_i32_rrir(i32 %a, i32 %b, i32 %d) {
+; CHECK-LABEL: st_param_v4_i32_rrir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_rrir_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_rrir_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 56, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, %r2, 3, %r4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, %r2, 3, %r3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 56
+; CHECK-NEXT:    ret;
+  %struct.rrir0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrir1 = insertvalue %struct.int4 %struct.rrir0, i32 %b, 1
+  %struct.rrir2 = insertvalue %struct.int4 %struct.rrir1, i32 3, 2
+  %struct.rrir3 = insertvalue %struct.int4 %struct.rrir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrir3)
+  ret void
+}
+define void @st_param_v4_i32_rrri(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: st_param_v4_i32_rrri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_rrri_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_rrri_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 57, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, %r2, %r3, 4};
@@ -925,51 +1371,147 @@ define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 57
+; CHECK-NEXT:    ret;
+  %struct.rrri0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrri1 = insertvalue %struct.int4 %struct.rrri0, i32 %b, 1
+  %struct.rrri2 = insertvalue %struct.int4 %struct.rrri1, i32 %c, 2
+  %struct.rrri3 = insertvalue %struct.int4 %struct.rrri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrri3)
+  ret void
+}
+define void @st_param_v4_i32_iirr(i32 %c, i32 %d) {
+; CHECK-LABEL: st_param_v4_i32_iirr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_iirr_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 58, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, %r3, %r4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, %r1, %r2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 58
+; CHECK-NEXT:    ret;
+  %struct.iirr0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.iirr1 = insertvalue %struct.int4 %struct.iirr0, i32 2, 1
+  %struct.iirr2 = insertvalue %struct.int4 %struct.iirr1, i32 %c, 2
+  %struct.iirr3 = insertvalue %struct.int4 %struct.iirr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.iirr3)
+  ret void
+}
+define void @st_param_v4_i32_irir(i32 %b, i32 %d) {
+; CHECK-LABEL: st_param_v4_i32_irir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_irir_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_irir_param_1];
 ; CHECK-NEXT:    { // callseq 59, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r2, 3, %r4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r1, 3, %r2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 59
+; CHECK-NEXT:    ret;
+  %struct.irir0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irir1 = insertvalue %struct.int4 %struct.irir0, i32 %b, 1
+  %struct.irir2 = insertvalue %struct.int4 %struct.irir1, i32 3, 2
+  %struct.irir3 = insertvalue %struct.int4 %struct.irir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.irir3)
+  ret void
+}
+define void @st_param_v4_i32_irri(i32 %b, i32 %c) {
+; CHECK-LABEL: st_param_v4_i32_irri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_irri_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_irri_param_1];
 ; CHECK-NEXT:    { // callseq 60, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r2, %r3, 4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r1, %r2, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 60
+; CHECK-NEXT:    ret;
+  %struct.irri0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irri1 = insertvalue %struct.int4 %struct.irri0, i32 %b, 1
+  %struct.irri2 = insertvalue %struct.int4 %struct.irri1, i32 %c, 2
+  %struct.irri3 = insertvalue %struct.int4 %struct.irri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.irri3)
+  ret void
+}
+define void @st_param_v4_i32_riir(i32 %a, i32 %d) {
+; CHECK-LABEL: st_param_v4_i32_riir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_riir_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_riir_param_1];
 ; CHECK-NEXT:    { // callseq 61, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, 3, %r4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, 3, %r2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 61
+; CHECK-NEXT:    ret;
+  %struct.riir0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.riir1 = insertvalue %struct.int4 %struct.riir0, i32 2, 1
+  %struct.riir2 = insertvalue %struct.int4 %struct.riir1, i32 3, 2
+  %struct.riir3 = insertvalue %struct.int4 %struct.riir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.riir3)
+  ret void
+}
+define void @st_param_v4_i32_riri(i32 %a, i32 %c) {
+; CHECK-LABEL: st_param_v4_i32_riri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_riri_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_riri_param_1];
 ; CHECK-NEXT:    { // callseq 62, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, %r3, 4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, %r2, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 62
+; CHECK-NEXT:    ret;
+  %struct.riri0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.riri1 = insertvalue %struct.int4 %struct.riri0, i32 2, 1
+  %struct.riri2 = insertvalue %struct.int4 %struct.riri1, i32 %c, 2
+  %struct.riri3 = insertvalue %struct.int4 %struct.riri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.riri3)
+  ret void
+}
+define void @st_param_v4_i32_rrii(i32 %a, i32 %b) {
+; CHECK-LABEL: st_param_v4_i32_rrii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_rrii_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 63, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, %r2, 3, 4};
@@ -979,123 +1521,103 @@ define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 63
+; CHECK-NEXT:    ret;
+  %struct.rrii0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrii1 = insertvalue %struct.int4 %struct.rrii0, i32 %b, 1
+  %struct.rrii2 = insertvalue %struct.int4 %struct.rrii1, i32 3, 2
+  %struct.rrii3 = insertvalue %struct.int4 %struct.rrii2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrii3)
+  ret void
+}
+define void @st_param_v4_i32_iiir(i32 %d) {
+; CHECK-LABEL: st_param_v4_i32_iiir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_iiir_param_0];
 ; CHECK-NEXT:    { // callseq 64, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, 3, %r4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, 3, %r1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 64
+; CHECK-NEXT:    ret;
+  %struct.iiir0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.iiir1 = insertvalue %struct.int4 %struct.iiir0, i32 2, 1
+  %struct.iiir2 = insertvalue %struct.int4 %struct.iiir1, i32 3, 2
+  %struct.iiir3 = insertvalue %struct.int4 %struct.iiir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.iiir3)
+  ret void
+}
+define void @st_param_v4_i32_iiri(i32 %c) {
+; CHECK-LABEL: st_param_v4_i32_iiri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_iiri_param_0];
 ; CHECK-NEXT:    { // callseq 65, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, %r3, 4};
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, 2, %r1, 4};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_i32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 65
-; CHECK-NEXT:    { // callseq 66, 0
-; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r2, 3, 4};
-; CHECK-NEXT:    call.uni
-; CHECK-NEXT:    call_v4_i32,
-; CHECK-NEXT:    (
-; CHECK-NEXT:    param0
-; CHECK-NEXT:    );
-; CHECK-NEXT:    } // callseq 66
-; CHECK-NEXT:    { // callseq 67, 0
-; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, 3, 4};
-; CHECK-NEXT:    call.uni
-; CHECK-NEXT:    call_v4_i32,
-; CHECK-NEXT:    (
-; CHECK-NEXT:    param0
-; CHECK-NEXT:    );
-; CHECK-NEXT:    } // callseq 67
 ; CHECK-NEXT:    ret;
-  call void @call_v4_i32(%struct.int4 { i32 1, i32 2, i32 3, i32 4 })
-
-  %struct.irrr0 = insertvalue %struct.int4 poison, i32 1, 0
-  %struct.irrr1 = insertvalue %struct.int4 %struct.irrr0, i32 %b, 1
-  %struct.irrr2 = insertvalue %struct.int4 %struct.irrr1, i32 %c, 2
-  %struct.irrr3 = insertvalue %struct.int4 %struct.irrr2, i32 %d, 3
-  call void @call_v4_i32(%struct.int4 %struct.irrr3)
-
-  %struct.rirr0 = insertvalue %struct.int4 poison, i32 %a, 0
-  %struct.rirr1 = insertvalue %struct.int4 %struct.rirr0, i32 2, 1
-  %struct.rirr2 = insertvalue %struct.int4 %struct.rirr1, i32 %c, 2
-  %struct.rirr3 = insertvalue %struct.int4 %struct.rirr2, i32 %d, 3
-  call void @call_v4_i32(%struct.int4 %struct.rirr3)
-
-  %struct.rrir0 = insertvalue %struct.int4 poison, i32 %a, 0
-  %struct.rrir1 = insertvalue %struct.int4 %struct.rrir0, i32 %b, 1
-  %struct.rrir2 = insertvalue %struct.int4 %struct.rrir1, i32 3, 2
-  %struct.rrir3 = insertvalue %struct.int4 %struct.rrir2, i32 %d, 3
-  call void @call_v4_i32(%struct.int4 %struct.rrir3)
-
-  %struct.rrri0 = insertvalue %struct.int4 poison, i32 %a, 0
-  %struct.rrri1 = insertvalue %struct.int4 %struct.rrri0, i32 %b, 1
-  %struct.rrri2 = insertvalue %struct.int4 %struct.rrri1, i32 %c, 2
-  %struct.rrri3 = insertvalue %struct.int4 %struct.rrri2, i32 4, 3
-  call void @call_v4_i32(%struct.int4 %struct.rrri3)
-
-  %struct.iirr0 = insertvalue %struct.int4 poison, i32 1, 0
-  %struct.iirr1 = insertvalue %struct.int4 %struct.iirr0, i32 2, 1
-  %struct.iirr2 = insertvalue %struct.int4 %struct.iirr1, i32 %c, 2
-  %struct.iirr3 = insertvalue %struct.int4 %struct.iirr2, i32 %d, 3
-  call void @call_v4_i32(%struct.int4 %struct.iirr3)
-
-  %struct.irir0 = insertvalue %struct.int4 poison, i32 1, 0
-  %struct.irir1 = insertvalue %struct.int4 %struct.irir0, i32 %b, 1
-  %struct.irir2 = insertvalue %struct.int4 %struct.irir1, i32 3, 2
-  %struct.irir3 = insertvalue %struct.int4 %struct.irir2, i32 %d, 3
-  call void @call_v4_i32(%struct.int4 %struct.irir3)
-
-  %struct.irri0 = insertvalue %struct.int4 poison, i32 1, 0
-  %struct.irri1 = insertvalue %struct.int4 %struct.irri0, i32 %b, 1
-  %struct.irri2 = insertvalue %struct.int4 %struct.irri1, i32 %c, 2
-  %struct.irri3 = insertvalue %struct.int4 %struct.irri2, i32 4, 3
-  call void @call_v4_i32(%struct.int4 %struct.irri3)
-
-  %struct.riir0 = insertvalue %struct.int4 poison, i32 %a, 0
-  %struct.riir1 = insertvalue %struct.int4 %struct.riir0, i32 2, 1
-  %struct.riir2 = insertvalue %struct.int4 %struct.riir1, i32 3, 2
-  %struct.riir3 = insertvalue %struct.int4 %struct.riir2, i32 %d, 3
-  call void @call_v4_i32(%struct.int4 %struct.riir3)
-
-  %struct.riri0 = insertvalue %struct.int4 poison, i32 %a, 0
-  %struct.riri1 = insertvalue %struct.int4 %struct.riri0, i32 2, 1
-  %struct.riri2 = insertvalue %struct.int4 %struct.riri1, i32 %c, 2
-  %struct.riri3 = insertvalue %struct.int4 %struct.riri2, i32 4, 3
-  call void @call_v4_i32(%struct.int4 %struct.riri3)
-
-  %struct.rrii0 = insertvalue %struct.int4 poison, i32 %a, 0
-  %struct.rrii1 = insertvalue %struct.int4 %struct.rrii0, i32 %b, 1
-  %struct.rrii2 = insertvalue %struct.int4 %struct.rrii1, i32 3, 2
-  %struct.rrii3 = insertvalue %struct.int4 %struct.rrii2, i32 4, 3
-  call void @call_v4_i32(%struct.int4 %struct.rrii3)
-
-  %struct.iiir0 = insertvalue %struct.int4 poison, i32 1, 0
-  %struct.iiir1 = insertvalue %struct.int4 %struct.iiir0, i32 2, 1
-  %struct.iiir2 = insertvalue %struct.int4 %struct.iiir1, i32 3, 2
-  %struct.iiir3 = insertvalue %struct.int4 %struct.iiir2, i32 %d, 3
-  call void @call_v4_i32(%struct.int4 %struct.iiir3)
-
   %struct.iiri0 = insertvalue %struct.int4 poison, i32 1, 0
   %struct.iiri1 = insertvalue %struct.int4 %struct.iiri0, i32 2, 1
   %struct.iiri2 = insertvalue %struct.int4 %struct.iiri1, i32 %c, 2
   %struct.iiri3 = insertvalue %struct.int4 %struct.iiri2, i32 4, 3
   call void @call_v4_i32(%struct.int4 %struct.iiri3)
-
+  ret void
+}
+define void @st_param_v4_i32_irii(i32 %b) {
+; CHECK-LABEL: st_param_v4_i32_irii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_irii_param_0];
+; CHECK-NEXT:    { // callseq 66, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {1, %r1, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 66
+; CHECK-NEXT:    ret;
   %struct.irii0 = insertvalue %struct.int4 poison, i32 1, 0
   %struct.irii1 = insertvalue %struct.int4 %struct.irii0, i32 %b, 1
   %struct.irii2 = insertvalue %struct.int4 %struct.irii1, i32 3, 2
   %struct.irii3 = insertvalue %struct.int4 %struct.irii2, i32 4, 3
   call void @call_v4_i32(%struct.int4 %struct.irii3)
-
+  ret void
+}
+define void @st_param_v4_i32_riii(i32 %a) {
+; CHECK-LABEL: st_param_v4_i32_riii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_riii_param_0];
+; CHECK-NEXT:    { // callseq 67, 0
+; CHECK-NEXT:    .param .align 16 .b8 param0[16];
+; CHECK-NEXT:    st.param.v4.b32 [param0+0], {%r1, 2, 3, 4};
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    call_v4_i32,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 67
+; CHECK-NEXT:    ret;
   %struct.riii0 = insertvalue %struct.int4 poison, i32 %a, 0
   %struct.riii1 = insertvalue %struct.int4 %struct.riii0, i32 2, 1
   %struct.riii2 = insertvalue %struct.int4 %struct.riii1, i32 3, 2
@@ -1104,13 +1626,12 @@ define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
   ret void
 }
 
-define void @st_param_v4_f32(float %a, float %b, float %c, float %d) {
-; CHECK-LABEL: st_param_v4_f32(
+define void @st_param_v4_f32_iiii() {
+; CHECK-LABEL: st_param_v4_f32_iiii(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_param_0];
 ; CHECK-NEXT:    { // callseq 68, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000};
@@ -1120,36 +1641,97 @@ define void @st_param_v4_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 68
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_param_1];
-; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_param_2];
-; CHECK-NEXT:    ld.param.f32 %f4, [st_param_v4_f32_param_3];
+; CHECK-NEXT:    ret;
+  call void @call_v4_f32(%struct.float4 { float 1.0, float 2.0, float 3.0, float 4.0 })
+  ret void
+}
+define void @st_param_v4_f32_irrr(float %b, float %c, float %d) {
+; CHECK-LABEL: st_param_v4_f32_irrr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_irrr_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_irrr_param_1];
+; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 69, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f2, %f3, %f4};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f1, %f2, %f3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 69
+; CHECK-NEXT:    ret;
+  %struct.irrr0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irrr1 = insertvalue %struct.float4 %struct.irrr0, float %b, 1
+  %struct.irrr2 = insertvalue %struct.float4 %struct.irrr1, float %c, 2
+  %struct.irrr3 = insertvalue %struct.float4 %struct.irrr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.irrr3)
+  ret void
+}
+define void @st_param_v4_f32_rirr(float %a, float %c, float %d) {
+; CHECK-LABEL: st_param_v4_f32_rirr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_rirr_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_rirr_param_1];
+; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 70, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, %f3, %f4};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, %f2, %f3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 70
+; CHECK-NEXT:    ret;
+  %struct.rirr0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rirr1 = insertvalue %struct.float4 %struct.rirr0, float 2.0, 1
+  %struct.rirr2 = insertvalue %struct.float4 %struct.rirr1, float %c, 2
+  %struct.rirr3 = insertvalue %struct.float4 %struct.rirr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.rirr3)
+  ret void
+}
+define void @st_param_v4_f32_rrir(float %a, float %b, float %d) {
+; CHECK-LABEL: st_param_v4_f32_rrir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_rrir_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_rrir_param_1];
+; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 71, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, %f2, 0f40400000, %f4};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, %f2, 0f40400000, %f3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 71
+; CHECK-NEXT:    ret;
+  %struct.rrir0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrir1 = insertvalue %struct.float4 %struct.rrir0, float %b, 1
+  %struct.rrir2 = insertvalue %struct.float4 %struct.rrir1, float 3.0, 2
+  %struct.rrir3 = insertvalue %struct.float4 %struct.rrir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrir3)
+  ret void
+}
+define void @st_param_v4_f32_rrri(float %a, float %b, float %c) {
+; CHECK-LABEL: st_param_v4_f32_rrri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_rrri_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_rrri_param_1];
+; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 72, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, %f2, %f3, 0f40800000};
@@ -1159,51 +1741,147 @@ define void @st_param_v4_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 72
+; CHECK-NEXT:    ret;
+  %struct.rrri0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrri1 = insertvalue %struct.float4 %struct.rrri0, float %b, 1
+  %struct.rrri2 = insertvalue %struct.float4 %struct.rrri1, float %c, 2
+  %struct.rrri3 = insertvalue %struct.float4 %struct.rrri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrri3)
+  ret void
+}
+define void @st_param_v4_f32_iirr(float %c, float %d) {
+; CHECK-LABEL: st_param_v4_f32_iirr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_iirr_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 73, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, %f3, %f4};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, %f1, %f2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 73
+; CHECK-NEXT:    ret;
+  %struct.iirr0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iirr1 = insertvalue %struct.float4 %struct.iirr0, float 2.0, 1
+  %struct.iirr2 = insertvalue %struct.float4 %struct.iirr1, float %c, 2
+  %struct.iirr3 = insertvalue %struct.float4 %struct.iirr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.iirr3)
+  ret void
+}
+define void @st_param_v4_f32_irir(float %b, float %d) {
+; CHECK-LABEL: st_param_v4_f32_irir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_irir_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_irir_param_1];
 ; CHECK-NEXT:    { // callseq 74, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f2, 0f40400000, %f4};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f1, 0f40400000, %f2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 74
+; CHECK-NEXT:    ret;
+  %struct.irir0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irir1 = insertvalue %struct.float4 %struct.irir0, float %b, 1
+  %struct.irir2 = insertvalue %struct.float4 %struct.irir1, float 3.0, 2
+  %struct.irir3 = insertvalue %struct.float4 %struct.irir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.irir3)
+  ret void
+}
+define void @st_param_v4_f32_irri(float %b, float %c) {
+; CHECK-LABEL: st_param_v4_f32_irri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_irri_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_irri_param_1];
 ; CHECK-NEXT:    { // callseq 75, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f2, %f3, 0f40800000};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f1, %f2, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 75
+; CHECK-NEXT:    ret;
+  %struct.irri0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irri1 = insertvalue %struct.float4 %struct.irri0, float %b, 1
+  %struct.irri2 = insertvalue %struct.float4 %struct.irri1, float %c, 2
+  %struct.irri3 = insertvalue %struct.float4 %struct.irri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.irri3)
+  ret void
+}
+define void @st_param_v4_f32_riir(float %a, float %d) {
+; CHECK-LABEL: st_param_v4_f32_riir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_riir_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_riir_param_1];
 ; CHECK-NEXT:    { // callseq 76, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, 0f40400000, %f4};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, 0f40400000, %f2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 76
+; CHECK-NEXT:    ret;
+  %struct.riir0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.riir1 = insertvalue %struct.float4 %struct.riir0, float 2.0, 1
+  %struct.riir2 = insertvalue %struct.float4 %struct.riir1, float 3.0, 2
+  %struct.riir3 = insertvalue %struct.float4 %struct.riir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.riir3)
+  ret void
+}
+define void @st_param_v4_f32_riri(float %a, float %c) {
+; CHECK-LABEL: st_param_v4_f32_riri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_riri_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_riri_param_1];
 ; CHECK-NEXT:    { // callseq 77, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, %f3, 0f40800000};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, %f2, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 77
+; CHECK-NEXT:    ret;
+  %struct.riri0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.riri1 = insertvalue %struct.float4 %struct.riri0, float 2.0, 1
+  %struct.riri2 = insertvalue %struct.float4 %struct.riri1, float %c, 2
+  %struct.riri3 = insertvalue %struct.float4 %struct.riri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.riri3)
+  ret void
+}
+define void @st_param_v4_f32_rrii(float %a, float %b) {
+; CHECK-LABEL: st_param_v4_f32_rrii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_rrii_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 78, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, %f2, 0f40400000, 0f40800000};
@@ -1213,33 +1891,93 @@ define void @st_param_v4_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 78
+; CHECK-NEXT:    ret;
+  %struct.rrii0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrii1 = insertvalue %struct.float4 %struct.rrii0, float %b, 1
+  %struct.rrii2 = insertvalue %struct.float4 %struct.rrii1, float 3.0, 2
+  %struct.rrii3 = insertvalue %struct.float4 %struct.rrii2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrii3)
+  ret void
+}
+define void @st_param_v4_f32_iiir(float %d) {
+; CHECK-LABEL: st_param_v4_f32_iiir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_iiir_param_0];
 ; CHECK-NEXT:    { // callseq 79, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, %f4};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, %f1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 79
+; CHECK-NEXT:    ret;
+  %struct.iiir0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iiir1 = insertvalue %struct.float4 %struct.iiir0, float 2.0, 1
+  %struct.iiir2 = insertvalue %struct.float4 %struct.iiir1, float 3.0, 2
+  %struct.iiir3 = insertvalue %struct.float4 %struct.iiir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.iiir3)
+  ret void
+}
+define void @st_param_v4_f32_iiri(float %c) {
+; CHECK-LABEL: st_param_v4_f32_iiri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_iiri_param_0];
 ; CHECK-NEXT:    { // callseq 80, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, %f3, 0f40800000};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, %f1, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 80
+; CHECK-NEXT:    ret;
+  %struct.iiri0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iiri1 = insertvalue %struct.float4 %struct.iiri0, float 2.0, 1
+  %struct.iiri2 = insertvalue %struct.float4 %struct.iiri1, float %c, 2
+  %struct.iiri3 = insertvalue %struct.float4 %struct.iiri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.iiri3)
+  ret void
+}
+define void @st_param_v4_f32_irii(float %b) {
+; CHECK-LABEL: st_param_v4_f32_irii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_irii_param_0];
 ; CHECK-NEXT:    { // callseq 81, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f2, 0f40400000, 0f40800000};
+; CHECK-NEXT:    st.param.v4.f32 [param0+0], {0f3F800000, %f1, 0f40400000, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 81
+; CHECK-NEXT:    ret;
+  %struct.irii0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irii1 = insertvalue %struct.float4 %struct.irii0, float %b, 1
+  %struct.irii2 = insertvalue %struct.float4 %struct.irii1, float 3.0, 2
+  %struct.irii3 = insertvalue %struct.float4 %struct.irii2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.irii3)
+  ret void
+}
+define void @st_param_v4_f32_riii(float %a) {
+; CHECK-LABEL: st_param_v4_f32_riii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_riii_param_0];
 ; CHECK-NEXT:    { // callseq 82, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.f32 [param0+0], {%f1, 0f40000000, 0f40400000, 0f40800000};
@@ -1250,86 +1988,6 @@ define void @st_param_v4_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT:    );
 ; CHECK-NEXT:    } // callseq 82
 ; CHECK-NEXT:    ret;
-  call void @call_v4_f32(%struct.float4 { float 1.0, float 2.0, float 3.0, float 4.0 })
-
-  %struct.irrr0 = insertvalue %struct.float4 poison, float 1.0, 0
-  %struct.irrr1 = insertvalue %struct.float4 %struct.irrr0, float %b, 1
-  %struct.irrr2 = insertvalue %struct.float4 %struct.irrr1, float %c, 2
-  %struct.irrr3 = insertvalue %struct.float4 %struct.irrr2, float %d, 3
-  call void @call_v4_f32(%struct.float4 %struct.irrr3)
-
-  %struct.rirr0 = insertvalue %struct.float4 poison, float %a, 0
-  %struct.rirr1 = insertvalue %struct.float4 %struct.rirr0, float 2.0, 1
-  %struct.rirr2 = insertvalue %struct.float4 %struct.rirr1, float %c, 2
-  %struct.rirr3 = insertvalue %struct.float4 %struct.rirr2, float %d, 3
-  call void @call_v4_f32(%struct.float4 %struct.rirr3)
-
-  %struct.rrir0 = insertvalue %struct.float4 poison, float %a, 0
-  %struct.rrir1 = insertvalue %struct.float4 %struct.rrir0, float %b, 1
-  %struct.rrir2 = insertvalue %struct.float4 %struct.rrir1, float 3.0, 2
-  %struct.rrir3 = insertvalue %struct.float4 %struct.rrir2, float %d, 3
-  call void @call_v4_f32(%struct.float4 %struct.rrir3)
-
-  %struct.rrri0 = insertvalue %struct.float4 poison, float %a, 0
-  %struct.rrri1 = insertvalue %struct.float4 %struct.rrri0, float %b, 1
-  %struct.rrri2 = insertvalue %struct.float4 %struct.rrri1, float %c, 2
-  %struct.rrri3 = insertvalue %struct.float4 %struct.rrri2, float 4.0, 3
-  call void @call_v4_f32(%struct.float4 %struct.rrri3)
-
-  %struct.iirr0 = insertvalue %struct.float4 poison, float 1.0, 0
-  %struct.iirr1 = insertvalue %struct.float4 %struct.iirr0, float 2.0, 1
-  %struct.iirr2 = insertvalue %struct.float4 %struct.iirr1, float %c, 2
-  %struct.iirr3 = insertvalue %struct.float4 %struct.iirr2, float %d, 3
-  call void @call_v4_f32(%struct.float4 %struct.iirr3)
-
-  %struct.irir0 = insertvalue %struct.float4 poison, float 1.0, 0
-  %struct.irir1 = insertvalue %struct.float4 %struct.irir0, float %b, 1
-  %struct.irir2 = insertvalue %struct.float4 %struct.irir1, float 3.0, 2
-  %struct.irir3 = insertvalue %struct.float4 %struct.irir2, float %d, 3
-  call void @call_v4_f32(%struct.float4 %struct.irir3)
-
-  %struct.irri0 = insertvalue %struct.float4 poison, float 1.0, 0
-  %struct.irri1 = insertvalue %struct.float4 %struct.irri0, float %b, 1
-  %struct.irri2 = insertvalue %struct.float4 %struct.irri1, float %c, 2
-  %struct.irri3 = insertvalue %struct.float4 %struct.irri2, float 4.0, 3
-  call void @call_v4_f32(%struct.float4 %struct.irri3)
-
-  %struct.riir0 = insertvalue %struct.float4 poison, float %a, 0
-  %struct.riir1 = insertvalue %struct.float4 %struct.riir0, float 2.0, 1
-  %struct.riir2 = insertvalue %struct.float4 %struct.riir1, float 3.0, 2
-  %struct.riir3 = insertvalue %struct.float4 %struct.riir2, float %d, 3
-  call void @call_v4_f32(%struct.float4 %struct.riir3)
-
-  %struct.riri0 = insertvalue %struct.float4 poison, float %a, 0
-  %struct.riri1 = insertvalue %struct.float4 %struct.riri0, float 2.0, 1
-  %struct.riri2 = insertvalue %struct.float4 %struct.riri1, float %c, 2
-  %struct.riri3 = insertvalue %struct.float4 %struct.riri2, float 4.0, 3
-  call void @call_v4_f32(%struct.float4 %struct.riri3)
-
-  %struct.rrii0 = insertvalue %struct.float4 poison, float %a, 0
-  %struct.rrii1 = insertvalue %struct.float4 %struct.rrii0, float %b, 1
-  %struct.rrii2 = insertvalue %struct.float4 %struct.rrii1, float 3.0, 2
-  %struct.rrii3 = insertvalue %struct.float4 %struct.rrii2, float 4.0, 3
-  call void @call_v4_f32(%struct.float4 %struct.rrii3)
-
-  %struct.iiir0 = insertvalue %struct.float4 poison, float 1.0, 0
-  %struct.iiir1 = insertvalue %struct.float4 %struct.iiir0, float 2.0, 1
-  %struct.iiir2 = insertvalue %struct.float4 %struct.iiir1, float 3.0, 2
-  %struct.iiir3 = insertvalue %struct.float4 %struct.iiir2, float %d, 3
-  call void @call_v4_f32(%struct.float4 %struct.iiir3)
-
-  %struct.iiri0 = insertvalue %struct.float4 poison, float 1.0, 0
-  %struct.iiri1 = insertvalue %struct.float4 %struct.iiri0, float 2.0, 1
-  %struct.iiri2 = insertvalue %struct.float4 %struct.iiri1, float %c, 2
-  %struct.iiri3 = insertvalue %struct.float4 %struct.iiri2, float 4.0, 3
-  call void @call_v4_f32(%struct.float4 %struct.iiri3)
-
-  %struct.irii0 = insertvalue %struct.float4 poison, float 1.0, 0
-  %struct.irii1 = insertvalue %struct.float4 %struct.irii0, float %b, 1
-  %struct.irii2 = insertvalue %struct.float4 %struct.irii1, float 3.0, 2
-  %struct.irii3 = insertvalue %struct.float4 %struct.irii2, float 4.0, 3
-  call void @call_v4_f32(%struct.float4 %struct.irii3)
-
   %struct.riii0 = insertvalue %struct.float4 poison, float %a, 0
   %struct.riii1 = insertvalue %struct.float4 %struct.riii0, float 2.0, 1
   %struct.riii2 = insertvalue %struct.float4 %struct.riii1, float 3.0, 2

>From d1fb3df4288afd5db3b1eb7ce33638388cb54f7e Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Tue, 14 May 2024 15:57:59 +0000
Subject: [PATCH 4/4] address comments

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp |  4 ++--
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     | 23 +++++++++------------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 7e7d167d6a2c7..2713b6859ff3b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2357,10 +2357,10 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
         default:
           break;
         case MVT::i32:
-          Opcode = NVPTX::StoreParamI8TruncI32;
+          Opcode = NVPTX::StoreParamI8TruncI32_r;
           break;
         case MVT::i64:
-          Opcode = NVPTX::StoreParamI8TruncI64;
+          Opcode = NVPTX::StoreParamI8TruncI64_r;
           break;
         }
       }
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6c4badfeb742c..c4c35a1f74ba9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2637,18 +2637,15 @@ class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
                 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
 
 let mayStore = true in {
-  class StoreParamInstReg<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
-                  "st.param" # opstr # " \t[param$a+$b], $val;",
-                  []>;
-
-  multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
-    def _r: StoreParamInstReg<regclass, opstr>;
 
-    def _i:
-          NVPTXInst<(outs), (ins IMMType:$val, i32imm:$a, i32imm:$b),
-                    "st.param" # opstr # " \t[param$a+$b], $val;",
-                    []>;
+  multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> {
+    foreach op = [IMMType, regclass] in
+      if !or(support_imm, !isa<NVPTXRegClass>(op)) then
+        def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
+          : NVPTXInst<(outs),
+                      (ins op:$val, i32imm:$a, i32imm:$b),
+                      "st.param" # opstr # " \t[param$a+$b], $val;",
+                      []>;
   }
 
   multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
@@ -2764,8 +2761,8 @@ defm StoreParamI32    : StoreParamInst<Int32Regs, i32imm, ".b32">;
 defm StoreParamI16    : StoreParamInst<Int16Regs, i16imm, ".b16">;
 defm StoreParamI8     : StoreParamInst<Int16Regs, i8imm,  ".b8">;
 
-def StoreParamI8TruncI32 : StoreParamInstReg<Int32Regs, ".b8">;
-def StoreParamI8TruncI64 : StoreParamInstReg<Int64Regs, ".b8">;
+defm StoreParamI8TruncI32 : StoreParamInst<Int32Regs, i8imm, ".b8", /* support_imm */ false>;
+defm StoreParamI8TruncI64 : StoreParamInst<Int64Regs, i8imm, ".b8", /* support_imm */ false>;
 
 defm StoreParamV2I64  : StoreParamV2Inst<Int64Regs, i64imm, ".b64">;
 defm StoreParamV2I32  : StoreParamV2Inst<Int32Regs, i32imm, ".b32">;



More information about the llvm-commits mailing list