[llvm] [NVPTX] support immediate values in st.param instructions (PR #91523)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Wed May 8 12:25:12 PDT 2024


https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/91523

Add support for generating `st.param` instructions with direct use of immediates. This eliminates the need for a `mov` instruction prior to the `st.param` resulting in more concise emitted PTX.

>From d56a4ceadc417dcc5b8f1d4caaf952af7c130887 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Wed, 8 May 2024 19:22:17 +0000
Subject: [PATCH] [NVPTX] support immediate values in st.param instructions

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 131 ++++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     |  99 ++--
 llvm/test/CodeGen/NVPTX/st-param-imm.ll     | 592 ++++++++++++++++++++
 3 files changed, 768 insertions(+), 54 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/st-param-imm.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 595395bb1b4b4..fc9d760093d66 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2182,6 +2182,84 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
   return true;
 }
 
+// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
+#define getOpcV2H(ty, op0, op1) NVPTX::StoreParamV2##ty##_##op0##op1
+
+#define getOpcV2H1(ty, op0, op1)                                               \
+  (op1) ? getOpcV2H(ty, op0, i) : getOpcV2H(ty, op0, r)
+
+#define getOpcodeForVectorStParamV2(ty, isimm)                                 \
+  (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
+
+#define getOpcV4H(ty, op0, op1, op2, op3)                                      \
+  NVPTX::StoreParamV4##ty##_##op0##op1##op2##op3
+
+#define getOpcV4H3(ty, op0, op1, op2, op3)                                     \
+  (op3) ? getOpcV4H(ty, op0, op1, op2, i) : getOpcV4H(ty, op0, op1, op2, r)
+
+#define getOpcV4H2(ty, op0, op1, op2, op3)                                     \
+  (op2) ? getOpcV4H3(ty, op0, op1, i, op3) : getOpcV4H3(ty, op0, op1, r, op3)
+
+#define getOpcV4H1(ty, op0, op1, op2, op3)                                     \
+  (op1) ? getOpcV4H2(ty, op0, i, op2, op3) : getOpcV4H2(ty, op0, r, op2, op3)
+
+#define getOpcodeForVectorStParamV4(ty, isimm)                                 \
+  (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3])                 \
+             : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
+
+#define getOpcodeForVectorStParam(n, ty, isimm)                                \
+  (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm)                            \
+           : getOpcodeForVectorStParamV4(ty, isimm)
+
+static std::optional<unsigned>
+pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, unsigned NumElts,
+                           MVT::SimpleValueType MemTy, SelectionDAG *CurDAG,
+                           SDLoc DL) {
+  // Determine which inputs are registers and immediates make new operators
+  // with constant values
+  SmallVector<bool, 4> IsImm(NumElts, false);
+  for (unsigned i = 0; i < NumElts; i++) {
+    IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
+    if (IsImm[i]) {
+      SDValue Imm = Ops[i];
+      if (MemTy == MVT::f32 || MemTy == MVT::f64) {
+        const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
+        const ConstantFP *CF = ConstImm->getConstantFPValue();
+        Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
+      } else {
+        const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
+        const ConstantInt *CI = ConstImm->getConstantIntValue();
+        Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
+      }
+      Ops[i] = Imm;
+    }
+  }
+
+  // Get opcode for MemTy, size, and register/immediate operand ordering
+  switch (MemTy) {
+  case MVT::i8:
+    return getOpcodeForVectorStParam(NumElts, I8, IsImm);
+  case MVT::i16:
+    return getOpcodeForVectorStParam(NumElts, I16, IsImm);
+  case MVT::i32:
+    return getOpcodeForVectorStParam(NumElts, I32, IsImm);
+  case MVT::i64:
+    if (NumElts == 4)
+      return std::nullopt;
+    return getOpcodeForVectorStParamV2(I64, IsImm);
+  case MVT::f32:
+    return getOpcodeForVectorStParam(NumElts, F32, IsImm);
+  case MVT::f64:
+    if (NumElts == 4)
+      return std::nullopt;
+    return getOpcodeForVectorStParamV2(F64, IsImm);
+  case MVT::f16:
+  case MVT::v2f16:
+  default:
+    return std::nullopt;
+  }
+}
+
 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
   SDLoc DL(N);
   SDValue Chain = N->getOperand(0);
@@ -2228,12 +2306,34 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
     switch (NumElts) {
     default:
       return false;
-    case 1:
-      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
-                               NVPTX::StoreParamI8, NVPTX::StoreParamI16,
-                               NVPTX::StoreParamI32, NVPTX::StoreParamI64,
-                               NVPTX::StoreParamF32, NVPTX::StoreParamF64);
-      if (Opcode == NVPTX::StoreParamI8) {
+    case 1: {
+      MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
+      SDValue Imm = Ops[0];
+      if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
+          (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
+        // Convert immediate to target constant
+        if (MemTy == MVT::f32 || MemTy == MVT::f64) {
+          const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
+          const ConstantFP *CF = ConstImm->getConstantFPValue();
+          Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
+        } else {
+          const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
+          const ConstantInt *CI = ConstImm->getConstantIntValue();
+          Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
+        }
+        Ops[0] = Imm;
+        // Use immediate version of store param
+        Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
+                                 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
+                                 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
+                                 NVPTX::StoreParamF64_i);
+      } else
+        Opcode =
+            pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+                            NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
+                            NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
+                            NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
+      if (Opcode == NVPTX::StoreParamI8_r) {
         // Fine tune the opcode depending on the size of the operand.
         // This helps to avoid creating redundant COPY instructions in
         // InstrEmitter::AddRegisterOperand().
@@ -2249,19 +2349,14 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
         }
       }
       break;
+    }
     case 2:
-      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
-                               NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
-                               NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
-                               NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
-      break;
-    case 4:
-      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
-                               NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
-                               NVPTX::StoreParamV4I32, std::nullopt,
-                               NVPTX::StoreParamV4F32, std::nullopt);
+    case 4: {
+      MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
+      Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
       break;
     }
+    }
     if (!Opcode)
       return false;
     break;
@@ -2269,7 +2364,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
   // conversion instruction first, and use that as the value operand to
   // the selected StoreParam node.
   case NVPTXISD::StoreParamU32: {
-    Opcode = NVPTX::StoreParamI32;
+    Opcode = NVPTX::StoreParamI32_r;
     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
                                                 MVT::i32);
     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
@@ -2278,7 +2373,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
     break;
   }
   case NVPTXISD::StoreParamS32: {
-    Opcode = NVPTX::StoreParamI32;
+    Opcode = NVPTX::StoreParamI32_r;
     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
                                                 MVT::i32);
     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 393fa29ff0516..6c4badfeb742c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2637,25 +2637,49 @@ class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
                 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
 
 let mayStore = true in {
-  class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+  class StoreParamInstReg<NVPTXRegClass regclass, string opstr> :
         NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
-                  !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
+                  "st.param" # opstr # " \t[param$a+$b], $val;",
                   []>;
 
-  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
-                               i32imm:$a, i32imm:$b),
-                  !strconcat("st.param.v2", opstr,
-                             " \t[param$a+$b], {{$val, $val2}};"),
-                  []>;
+  multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
+    def _r: StoreParamInstReg<regclass, opstr>;
 
-  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
-                               regclass:$val4, i32imm:$a,
-                               i32imm:$b),
-                  !strconcat("st.param.v4", opstr,
-                             " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
-                  []>;
+    def _i:
+          NVPTXInst<(outs), (ins IMMType:$val, i32imm:$a, i32imm:$b),
+                    "st.param" # opstr # " \t[param$a+$b], $val;",
+                    []>;
+  }
+
+  multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
+    foreach op1 = [IMMType, regclass] in
+      foreach op2 = [IMMType, regclass] in
+        def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
+              # !if(!isa<NVPTXRegClass>(op2), "r", "i")
+          : NVPTXInst<(outs),
+                      (ins op1:$val1, op2:$val2,
+                           i32imm:$a, i32imm:$b),
+                      "st.param.v2" # opstr # " \t[param$a+$b], {{$val1, $val2}};",
+                      []>;
+  }
+
+  multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
+    foreach op1 = [IMMType, regclass] in
+      foreach op2 = [IMMType, regclass] in
+        foreach op3 = [IMMType, regclass] in
+          foreach op4 = [IMMType, regclass] in
+            def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
+                  # !if(!isa<NVPTXRegClass>(op2), "r", "i")
+                  # !if(!isa<NVPTXRegClass>(op3), "r", "i")
+                  # !if(!isa<NVPTXRegClass>(op4), "r", "i")
+
+              : NVPTXInst<(outs),
+                          (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
+                               i32imm:$a, i32imm:$b),
+                          "st.param.v4" # opstr #
+                          " \t[param$a+$b], {{$val1, $val2, $val3, $val4}};",
+                          []>;
+  }
 
   class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
         NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
@@ -2735,27 +2759,30 @@ def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
 def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
 def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
 
-def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
-def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
-
-def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
-def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
-def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">;
-def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">;
-def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
-def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
-def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
-def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;
-
-def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
-def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
-def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;
-
-def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
-def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
-def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
-def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
-def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
+defm StoreParamI64    : StoreParamInst<Int64Regs, i64imm, ".b64">;
+defm StoreParamI32    : StoreParamInst<Int32Regs, i32imm, ".b32">;
+defm StoreParamI16    : StoreParamInst<Int16Regs, i16imm, ".b16">;
+defm StoreParamI8     : StoreParamInst<Int16Regs, i8imm,  ".b8">;
+
+def StoreParamI8TruncI32 : StoreParamInstReg<Int32Regs, ".b8">;
+def StoreParamI8TruncI64 : StoreParamInstReg<Int64Regs, ".b8">;
+
+defm StoreParamV2I64  : StoreParamV2Inst<Int64Regs, i64imm, ".b64">;
+defm StoreParamV2I32  : StoreParamV2Inst<Int32Regs, i32imm, ".b32">;
+defm StoreParamV2I16  : StoreParamV2Inst<Int16Regs, i16imm, ".b16">;
+defm StoreParamV2I8   : StoreParamV2Inst<Int16Regs, i8imm,  ".b8">;
+
+defm StoreParamV4I32  : StoreParamV4Inst<Int32Regs, i32imm, ".b32">;
+defm StoreParamV4I16  : StoreParamV4Inst<Int16Regs, i16imm, ".b16">;
+defm StoreParamV4I8   : StoreParamV4Inst<Int16Regs, i8imm,  ".b8">;
+
+defm StoreParamF32    : StoreParamInst<Float32Regs, f32imm, ".f32">;
+defm StoreParamF64    : StoreParamInst<Float64Regs, f64imm, ".f64">;
+
+defm StoreParamV2F32  : StoreParamV2Inst<Float32Regs, f32imm, ".f32">;
+defm StoreParamV2F64  : StoreParamV2Inst<Float64Regs, f64imm, ".f64">;
+
+defm StoreParamV4F32  : StoreParamV4Inst<Float32Regs, f32imm, ".f32">;
 
 def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
 def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
new file mode 100644
index 0000000000000..022035505e2a7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -0,0 +1,592 @@
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: llc < %s -march=nvptx | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -verify-machineinstrs | %ptxas-verify %}
+
+%struct.A = type { i8, i16 }
+%struct.char2 = type { i8, i8 }
+%struct.char4 = type { i8, i8, i8, i8 }
+%struct.short2 = type { i16, i16 }
+%struct.short4 = type { i16, i16, i16, i16 }
+%struct.int2 = type { i32, i32 }
+%struct.int4 = type { i32, i32, i32, i32 }
+%struct.longlong2 = type { i64, i64 }
+%struct.float2 = type { float, float }
+%struct.float4 = type { float, float, float, float }
+%struct.double2 = type { double, double }
+
+; CHECK-LABEL: st_param_i8_i16
+; CHECK: st.param.b8 [param0+0], 1
+; CHECK: st.param.b16 [param0+2], 2
+define void @st_param_i8_i16() {
+  call void @call_i8_i16(%struct.A { i8 1, i16 2 })
+  ret void
+}
+
+; CHECK-LABEL: st_param_i32
+; CHECK: st.param.b32 [param0+0], 3
+define void @st_param_i32() {
+  call void @call_i32(i32 3)
+  ret void
+}
+
+; CHECK-LABEL: st_param_i64
+; CHECK: st.param.b64 [param0+0], 4
+define void @st_param_i64() {
+  call void @call_i64(i64 4)
+  ret void
+}
+
+; CHECK-LABEL: st_param_f32
+; CHECK: st.param.f32 [param0+0], 0f40A00000
+define void @st_param_f32() {
+  call void @call_f32(float 5.0)
+  ret void
+}
+
+; CHECK-LABEL: st_param_f64
+; CHECK: st.param.f64 [param0+0], 0d4018000000000000
+define void @st_param_f64() {
+  call void @call_f64(double 6.0)
+  ret void
+}
+
+declare void @call_i8_i16(%struct.A)
+declare void @call_i32(i32)
+declare void @call_i64(i64)
+declare void @call_f32(float)
+declare void @call_f64(double)
+
+; CHECK-LABEL: st_param_v2_i8
+; CHECK: st.param.v2.b8 [param0+0], {1, 2}
+; CHECK: st.param.v2.b8 [param0+0], {1, {{%rs[0-9]+}}}
+; CHECK: st.param.v2.b8 [param0+0], {{{%rs[0-9]+}}, 2}
+define void @st_param_v2_i8(i8 %val) {
+  call void @call_v2_i8(%struct.char2 { i8 1, i8 2 })
+  %struct.ir0 = insertvalue %struct.char2 poison, i8 1, 0
+  %struct.ir1 = insertvalue %struct.char2 %struct.ir0, i8 %val, 1
+  call void @call_v2_i8(%struct.char2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.char2 poison, i8 %val, 0
+  %struct.ri1 = insertvalue %struct.char2 %struct.ri0, i8 2, 1
+  call void @call_v2_i8(%struct.char2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_i16
+; CHECK: st.param.v2.b16 [param0+0], {1, 2}
+; CHECK: st.param.v2.b16 [param0+0], {1, {{%rs[0-9]+}}}
+; CHECK: st.param.v2.b16 [param0+0], {{{%rs[0-9]+}}, 2}
+define void @st_param_v2_i16(i16 %val) {
+  call void @call_v2_i16(%struct.short2 { i16 1, i16 2 })
+  %struct.ir0 = insertvalue %struct.short2 poison, i16 1, 0
+  %struct.ir1 = insertvalue %struct.short2 %struct.ir0, i16 %val, 1
+  call void @call_v2_i16(%struct.short2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.short2 poison, i16 %val, 0
+  %struct.ri1 = insertvalue %struct.short2 %struct.ri0, i16 2, 1
+  call void @call_v2_i16(%struct.short2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_i32
+; CHECK: st.param.v2.b32 [param0+0], {1, 2}
+; CHECK: st.param.v2.b32 [param0+0], {1, {{%r[0-9]+}}}
+; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, 2}
+define void @st_param_v2_i32(i32 %val) {
+  call void @call_v2_i32(%struct.int2 { i32 1, i32 2 })
+  %struct.ir0 = insertvalue %struct.int2 poison, i32 1, 0
+  %struct.ir1 = insertvalue %struct.int2 %struct.ir0, i32 %val, 1
+  call void @call_v2_i32(%struct.int2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.int2 poison, i32 %val, 0
+  %struct.ri1 = insertvalue %struct.int2 %struct.ri0, i32 2, 1
+  call void @call_v2_i32(%struct.int2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_i64
+; CHECK: st.param.v2.b64 [param0+0], {1, 2}
+; CHECK: st.param.v2.b64 [param0+0], {1, {{%rd[0-9]+}}}
+; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, 2}
+define void @st_param_v2_i64(i64 %val) {
+  call void @call_v2_i64(%struct.longlong2 { i64 1, i64 2 })
+  %struct.ir0 = insertvalue %struct.longlong2 poison, i64 1, 0
+  %struct.ir1 = insertvalue %struct.longlong2 %struct.ir0, i64 %val, 1
+  call void @call_v2_i64(%struct.longlong2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.longlong2 poison, i64 %val, 0
+  %struct.ri1 = insertvalue %struct.longlong2 %struct.ri0, i64 2, 1
+  call void @call_v2_i64(%struct.longlong2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_f32
+; CHECK: st.param.v2.f32 [param0+0], {0f3F800000, 0f40000000}
+; CHECK: st.param.v2.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}}
+; CHECK: st.param.v2.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000}
+define void @st_param_v2_f32(float %val) {
+  call void @call_v2_f32(%struct.float2 { float 1.0, float 2.0 })
+  %struct.ir0 = insertvalue %struct.float2 poison, float 1.0, 0
+  %struct.ir1 = insertvalue %struct.float2 %struct.ir0, float %val, 1
+  call void @call_v2_f32(%struct.float2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.float2 poison, float %val, 0
+  %struct.ri1 = insertvalue %struct.float2 %struct.ri0, float 2.0, 1
+  call void @call_v2_f32(%struct.float2 %struct.ri1)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v2_f64
+; CHECK: st.param.v2.f64 [param0+0], {0d3FF0000000000000, 0d4000000000000000}
+; CHECK: st.param.v2.f64 [param0+0], {0d3FF0000000000000, {{%fd[0-9]+}}}
+; CHECK: st.param.v2.f64 [param0+0], {{{%fd[0-9]+}}, 0d4000000000000000}
+define void @st_param_v2_f64(double %val) {
+  call void @call_v2_f64(%struct.double2 { double 1.0, double 2.0 })
+  %struct.ir0 = insertvalue %struct.double2 poison, double 1.0, 0
+  %struct.ir1 = insertvalue %struct.double2 %struct.ir0, double %val, 1
+  call void @call_v2_f64(%struct.double2 %struct.ir1)
+  %struct.ri0 = insertvalue %struct.double2 poison, double %val, 0
+  %struct.ri1 = insertvalue %struct.double2 %struct.ri0, double 2.0, 1
+  call void @call_v2_f64(%struct.double2 %struct.ri1)
+  ret void
+}
+
+declare void @call_v2_i8(%struct.char2)
+declare void @call_v2_i16(%struct.short2)
+declare void @call_v2_i32(%struct.int2)
+declare void @call_v2_i64(%struct.longlong2)
+declare void @call_v2_f32(%struct.float2)
+declare void @call_v2_f64(%struct.double2)
+
+; CHECK-LABEL: st_param_v4_i8
+; CHECK: st.param.v4.b8 [param0+0], {1, 2, 3, 4}
+; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b8 [param0+0], {1, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b8 [param0+0], {1, 2, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b8 [param0+0], {1, 2, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b8 [param0+0], {1, {{%rs[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b8 [param0+0], {{{%rs[0-9]+}}, 2, 3, 4}
+define void @st_param_v4_i8(i8 %a, i8 %b, i8 %c, i8 %d) {
+  call void @call_v4_i8(%struct.char4 { i8 1, i8 2, i8 3, i8 4 })
+
+  %struct.irrr0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irrr1 = insertvalue %struct.char4 %struct.irrr0, i8 %b, 1
+  %struct.irrr2 = insertvalue %struct.char4 %struct.irrr1, i8 %c, 2
+  %struct.irrr3 = insertvalue %struct.char4 %struct.irrr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.irrr3)
+
+  %struct.rirr0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rirr1 = insertvalue %struct.char4 %struct.rirr0, i8 2, 1
+  %struct.rirr2 = insertvalue %struct.char4 %struct.rirr1, i8 %c, 2
+  %struct.rirr3 = insertvalue %struct.char4 %struct.rirr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.rirr3)
+
+  %struct.rrir0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrir1 = insertvalue %struct.char4 %struct.rrir0, i8 %b, 1
+  %struct.rrir2 = insertvalue %struct.char4 %struct.rrir1, i8 3, 2
+  %struct.rrir3 = insertvalue %struct.char4 %struct.rrir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrir3)
+
+  %struct.rrri0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrri1 = insertvalue %struct.char4 %struct.rrri0, i8 %b, 1
+  %struct.rrri2 = insertvalue %struct.char4 %struct.rrri1, i8 %c, 2
+  %struct.rrri3 = insertvalue %struct.char4 %struct.rrri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrri3)
+
+  %struct.iirr0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iirr1 = insertvalue %struct.char4 %struct.iirr0, i8 2, 1
+  %struct.iirr2 = insertvalue %struct.char4 %struct.iirr1, i8 %c, 2
+  %struct.iirr3 = insertvalue %struct.char4 %struct.iirr2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.iirr3)
+
+  %struct.irir0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irir1 = insertvalue %struct.char4 %struct.irir0, i8 %b, 1
+  %struct.irir2 = insertvalue %struct.char4 %struct.irir1, i8 3, 2
+  %struct.irir3 = insertvalue %struct.char4 %struct.irir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.irir3)
+
+  %struct.irri0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irri1 = insertvalue %struct.char4 %struct.irri0, i8 %b, 1
+  %struct.irri2 = insertvalue %struct.char4 %struct.irri1, i8 %c, 2
+  %struct.irri3 = insertvalue %struct.char4 %struct.irri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.irri3)
+
+  %struct.riir0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.riir1 = insertvalue %struct.char4 %struct.riir0, i8 2, 1
+  %struct.riir2 = insertvalue %struct.char4 %struct.riir1, i8 3, 2
+  %struct.riir3 = insertvalue %struct.char4 %struct.riir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.riir3)
+
+  %struct.riri0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.riri1 = insertvalue %struct.char4 %struct.riri0, i8 2, 1
+  %struct.riri2 = insertvalue %struct.char4 %struct.riri1, i8 %c, 2
+  %struct.riri3 = insertvalue %struct.char4 %struct.riri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.riri3)
+
+  %struct.rrii0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.rrii1 = insertvalue %struct.char4 %struct.rrii0, i8 %b, 1
+  %struct.rrii2 = insertvalue %struct.char4 %struct.rrii1, i8 3, 2
+  %struct.rrii3 = insertvalue %struct.char4 %struct.rrii2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.rrii3)
+
+  %struct.iiir0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iiir1 = insertvalue %struct.char4 %struct.iiir0, i8 2, 1
+  %struct.iiir2 = insertvalue %struct.char4 %struct.iiir1, i8 3, 2
+  %struct.iiir3 = insertvalue %struct.char4 %struct.iiir2, i8 %d, 3
+  call void @call_v4_i8(%struct.char4 %struct.iiir3)
+
+  %struct.iiri0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.iiri1 = insertvalue %struct.char4 %struct.iiri0, i8 2, 1
+  %struct.iiri2 = insertvalue %struct.char4 %struct.iiri1, i8 %c, 2
+  %struct.iiri3 = insertvalue %struct.char4 %struct.iiri2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.iiri3)
+
+  %struct.irii0 = insertvalue %struct.char4 poison, i8 1, 0
+  %struct.irii1 = insertvalue %struct.char4 %struct.irii0, i8 %b, 1
+  %struct.irii2 = insertvalue %struct.char4 %struct.irii1, i8 3, 2
+  %struct.irii3 = insertvalue %struct.char4 %struct.irii2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.irii3)
+
+  %struct.riii0 = insertvalue %struct.char4 poison, i8 %a, 0
+  %struct.riii1 = insertvalue %struct.char4 %struct.riii0, i8 2, 1
+  %struct.riii2 = insertvalue %struct.char4 %struct.riii1, i8 3, 2
+  %struct.riii3 = insertvalue %struct.char4 %struct.riii2, i8 4, 3
+  call void @call_v4_i8(%struct.char4 %struct.riii3)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v4_i16
+; CHECK: st.param.v4.b16 [param0+0], {1, 2, 3, 4}
+; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b16 [param0+0], {1, 2, {{%rs[0-9]+}}, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, {{%rs[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b16 [param0+0], {1, 2, 3, {{%rs[0-9]+}}}
+; CHECK: st.param.v4.b16 [param0+0], {1, 2, {{%rs[0-9]+}}, 4}
+; CHECK: st.param.v4.b16 [param0+0], {1, {{%rs[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b16 [param0+0], {{{%rs[0-9]+}}, 2, 3, 4}
+define void @st_param_v4_i16(i16 %a, i16 %b, i16 %c, i16 %d) {
+  call void @call_v4_i16(%struct.short4 { i16 1, i16 2, i16 3, i16 4 })
+
+  %struct.irrr0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irrr1 = insertvalue %struct.short4 %struct.irrr0, i16 %b, 1
+  %struct.irrr2 = insertvalue %struct.short4 %struct.irrr1, i16 %c, 2
+  %struct.irrr3 = insertvalue %struct.short4 %struct.irrr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.irrr3)
+
+  %struct.rirr0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rirr1 = insertvalue %struct.short4 %struct.rirr0, i16 2, 1
+  %struct.rirr2 = insertvalue %struct.short4 %struct.rirr1, i16 %c, 2
+  %struct.rirr3 = insertvalue %struct.short4 %struct.rirr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.rirr3)
+
+  %struct.rrir0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrir1 = insertvalue %struct.short4 %struct.rrir0, i16 %b, 1
+  %struct.rrir2 = insertvalue %struct.short4 %struct.rrir1, i16 3, 2
+  %struct.rrir3 = insertvalue %struct.short4 %struct.rrir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrir3)
+
+  %struct.rrri0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrri1 = insertvalue %struct.short4 %struct.rrri0, i16 %b, 1
+  %struct.rrri2 = insertvalue %struct.short4 %struct.rrri1, i16 %c, 2
+  %struct.rrri3 = insertvalue %struct.short4 %struct.rrri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrri3)
+
+  %struct.iirr0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iirr1 = insertvalue %struct.short4 %struct.iirr0, i16 2, 1
+  %struct.iirr2 = insertvalue %struct.short4 %struct.iirr1, i16 %c, 2
+  %struct.iirr3 = insertvalue %struct.short4 %struct.iirr2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.iirr3)
+
+  %struct.irir0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irir1 = insertvalue %struct.short4 %struct.irir0, i16 %b, 1
+  %struct.irir2 = insertvalue %struct.short4 %struct.irir1, i16 3, 2
+  %struct.irir3 = insertvalue %struct.short4 %struct.irir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.irir3)
+
+  %struct.irri0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irri1 = insertvalue %struct.short4 %struct.irri0, i16 %b, 1
+  %struct.irri2 = insertvalue %struct.short4 %struct.irri1, i16 %c, 2
+  %struct.irri3 = insertvalue %struct.short4 %struct.irri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.irri3)
+
+  %struct.riir0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.riir1 = insertvalue %struct.short4 %struct.riir0, i16 2, 1
+  %struct.riir2 = insertvalue %struct.short4 %struct.riir1, i16 3, 2
+  %struct.riir3 = insertvalue %struct.short4 %struct.riir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.riir3)
+
+  %struct.riri0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.riri1 = insertvalue %struct.short4 %struct.riri0, i16 2, 1
+  %struct.riri2 = insertvalue %struct.short4 %struct.riri1, i16 %c, 2
+  %struct.riri3 = insertvalue %struct.short4 %struct.riri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.riri3)
+
+  %struct.rrii0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.rrii1 = insertvalue %struct.short4 %struct.rrii0, i16 %b, 1
+  %struct.rrii2 = insertvalue %struct.short4 %struct.rrii1, i16 3, 2
+  %struct.rrii3 = insertvalue %struct.short4 %struct.rrii2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.rrii3)
+
+  %struct.iiir0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iiir1 = insertvalue %struct.short4 %struct.iiir0, i16 2, 1
+  %struct.iiir2 = insertvalue %struct.short4 %struct.iiir1, i16 3, 2
+  %struct.iiir3 = insertvalue %struct.short4 %struct.iiir2, i16 %d, 3
+  call void @call_v4_i16(%struct.short4 %struct.iiir3)
+
+  %struct.iiri0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.iiri1 = insertvalue %struct.short4 %struct.iiri0, i16 2, 1
+  %struct.iiri2 = insertvalue %struct.short4 %struct.iiri1, i16 %c, 2
+  %struct.iiri3 = insertvalue %struct.short4 %struct.iiri2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.iiri3)
+
+  %struct.irii0 = insertvalue %struct.short4 poison, i16 1, 0
+  %struct.irii1 = insertvalue %struct.short4 %struct.irii0, i16 %b, 1
+  %struct.irii2 = insertvalue %struct.short4 %struct.irii1, i16 3, 2
+  %struct.irii3 = insertvalue %struct.short4 %struct.irii2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.irii3)
+
+  %struct.riii0 = insertvalue %struct.short4 poison, i16 %a, 0
+  %struct.riii1 = insertvalue %struct.short4 %struct.riii0, i16 2, 1
+  %struct.riii2 = insertvalue %struct.short4 %struct.riii1, i16 3, 2
+  %struct.riii3 = insertvalue %struct.short4 %struct.riii2, i16 4, 3
+  call void @call_v4_i16(%struct.short4 %struct.riii3)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v4_i32
+; CHECK: st.param.v4.b32 [param0+0], {1, 2, 3, 4}
+; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, {{%r[0-9]+}}, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, 3, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, 4}
+; CHECK: st.param.v4.b32 [param0+0], {1, 2, {{%r[0-9]+}}, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, 3, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, {{%r[0-9]+}}, 4}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, 3, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, {{%r[0-9]+}}, 4}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b32 [param0+0], {1, 2, 3, {{%r[0-9]+}}}
+; CHECK: st.param.v4.b32 [param0+0], {1, 2, {{%r[0-9]+}}, 4}
+; CHECK: st.param.v4.b32 [param0+0], {1, {{%r[0-9]+}}, 3, 4}
+; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, 2, 3, 4}
+define void @st_param_v4_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
+  call void @call_v4_i32(%struct.int4 { i32 1, i32 2, i32 3, i32 4 })
+
+  %struct.irrr0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irrr1 = insertvalue %struct.int4 %struct.irrr0, i32 %b, 1
+  %struct.irrr2 = insertvalue %struct.int4 %struct.irrr1, i32 %c, 2
+  %struct.irrr3 = insertvalue %struct.int4 %struct.irrr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.irrr3)
+
+  %struct.rirr0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rirr1 = insertvalue %struct.int4 %struct.rirr0, i32 2, 1
+  %struct.rirr2 = insertvalue %struct.int4 %struct.rirr1, i32 %c, 2
+  %struct.rirr3 = insertvalue %struct.int4 %struct.rirr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.rirr3)
+
+  %struct.rrir0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrir1 = insertvalue %struct.int4 %struct.rrir0, i32 %b, 1
+  %struct.rrir2 = insertvalue %struct.int4 %struct.rrir1, i32 3, 2
+  %struct.rrir3 = insertvalue %struct.int4 %struct.rrir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrir3)
+
+  %struct.rrri0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrri1 = insertvalue %struct.int4 %struct.rrri0, i32 %b, 1
+  %struct.rrri2 = insertvalue %struct.int4 %struct.rrri1, i32 %c, 2
+  %struct.rrri3 = insertvalue %struct.int4 %struct.rrri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrri3)
+
+  %struct.iirr0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.iirr1 = insertvalue %struct.int4 %struct.iirr0, i32 2, 1
+  %struct.iirr2 = insertvalue %struct.int4 %struct.iirr1, i32 %c, 2
+  %struct.iirr3 = insertvalue %struct.int4 %struct.iirr2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.iirr3)
+
+  %struct.irir0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irir1 = insertvalue %struct.int4 %struct.irir0, i32 %b, 1
+  %struct.irir2 = insertvalue %struct.int4 %struct.irir1, i32 3, 2
+  %struct.irir3 = insertvalue %struct.int4 %struct.irir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.irir3)
+
+  %struct.irri0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irri1 = insertvalue %struct.int4 %struct.irri0, i32 %b, 1
+  %struct.irri2 = insertvalue %struct.int4 %struct.irri1, i32 %c, 2
+  %struct.irri3 = insertvalue %struct.int4 %struct.irri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.irri3)
+
+  %struct.riir0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.riir1 = insertvalue %struct.int4 %struct.riir0, i32 2, 1
+  %struct.riir2 = insertvalue %struct.int4 %struct.riir1, i32 3, 2
+  %struct.riir3 = insertvalue %struct.int4 %struct.riir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.riir3)
+
+  %struct.riri0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.riri1 = insertvalue %struct.int4 %struct.riri0, i32 2, 1
+  %struct.riri2 = insertvalue %struct.int4 %struct.riri1, i32 %c, 2
+  %struct.riri3 = insertvalue %struct.int4 %struct.riri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.riri3)
+
+  %struct.rrii0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.rrii1 = insertvalue %struct.int4 %struct.rrii0, i32 %b, 1
+  %struct.rrii2 = insertvalue %struct.int4 %struct.rrii1, i32 3, 2
+  %struct.rrii3 = insertvalue %struct.int4 %struct.rrii2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.rrii3)
+
+  %struct.iiir0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.iiir1 = insertvalue %struct.int4 %struct.iiir0, i32 2, 1
+  %struct.iiir2 = insertvalue %struct.int4 %struct.iiir1, i32 3, 2
+  %struct.iiir3 = insertvalue %struct.int4 %struct.iiir2, i32 %d, 3
+  call void @call_v4_i32(%struct.int4 %struct.iiir3)
+
+  %struct.iiri0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.iiri1 = insertvalue %struct.int4 %struct.iiri0, i32 2, 1
+  %struct.iiri2 = insertvalue %struct.int4 %struct.iiri1, i32 %c, 2
+  %struct.iiri3 = insertvalue %struct.int4 %struct.iiri2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.iiri3)
+
+  %struct.irii0 = insertvalue %struct.int4 poison, i32 1, 0
+  %struct.irii1 = insertvalue %struct.int4 %struct.irii0, i32 %b, 1
+  %struct.irii2 = insertvalue %struct.int4 %struct.irii1, i32 3, 2
+  %struct.irii3 = insertvalue %struct.int4 %struct.irii2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.irii3)
+
+  %struct.riii0 = insertvalue %struct.int4 poison, i32 %a, 0
+  %struct.riii1 = insertvalue %struct.int4 %struct.riii0, i32 2, 1
+  %struct.riii2 = insertvalue %struct.int4 %struct.riii1, i32 3, 2
+  %struct.riii3 = insertvalue %struct.int4 %struct.riii2, i32 4, 3
+  call void @call_v4_i32(%struct.int4 %struct.riii3)
+  ret void
+}
+
+; CHECK-LABEL: st_param_v4_f32
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, {{%f[0-9]+}}, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, {{%f[0-9]+}}, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, 0f40400000, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, {{%f[0-9]+}}, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, {{%f[0-9]+}}, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, 0f40400000, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, {{%f[0-9]+}}, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, 0f40400000, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, {{%f[0-9]+}}, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, {{%f[0-9]+}}, 0f40400000, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, {{%f[0-9]+}}}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, {{%f[0-9]+}}, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {0f3F800000, {{%f[0-9]+}}, 0f40400000, 0f40800000}
+; CHECK: st.param.v4.f32 [param0+0], {{{%f[0-9]+}}, 0f40000000, 0f40400000, 0f40800000}
+define void @st_param_v4_f32(float %a, float %b, float %c, float %d) {
+  call void @call_v4_f32(%struct.float4 { float 1.0, float 2.0, float 3.0, float 4.0 })
+
+  %struct.irrr0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irrr1 = insertvalue %struct.float4 %struct.irrr0, float %b, 1
+  %struct.irrr2 = insertvalue %struct.float4 %struct.irrr1, float %c, 2
+  %struct.irrr3 = insertvalue %struct.float4 %struct.irrr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.irrr3)
+
+  %struct.rirr0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rirr1 = insertvalue %struct.float4 %struct.rirr0, float 2.0, 1
+  %struct.rirr2 = insertvalue %struct.float4 %struct.rirr1, float %c, 2
+  %struct.rirr3 = insertvalue %struct.float4 %struct.rirr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.rirr3)
+
+  %struct.rrir0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrir1 = insertvalue %struct.float4 %struct.rrir0, float %b, 1
+  %struct.rrir2 = insertvalue %struct.float4 %struct.rrir1, float 3.0, 2
+  %struct.rrir3 = insertvalue %struct.float4 %struct.rrir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrir3)
+
+  %struct.rrri0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrri1 = insertvalue %struct.float4 %struct.rrri0, float %b, 1
+  %struct.rrri2 = insertvalue %struct.float4 %struct.rrri1, float %c, 2
+  %struct.rrri3 = insertvalue %struct.float4 %struct.rrri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrri3)
+
+  %struct.iirr0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iirr1 = insertvalue %struct.float4 %struct.iirr0, float 2.0, 1
+  %struct.iirr2 = insertvalue %struct.float4 %struct.iirr1, float %c, 2
+  %struct.iirr3 = insertvalue %struct.float4 %struct.iirr2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.iirr3)
+
+  %struct.irir0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irir1 = insertvalue %struct.float4 %struct.irir0, float %b, 1
+  %struct.irir2 = insertvalue %struct.float4 %struct.irir1, float 3.0, 2
+  %struct.irir3 = insertvalue %struct.float4 %struct.irir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.irir3)
+
+  %struct.irri0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irri1 = insertvalue %struct.float4 %struct.irri0, float %b, 1
+  %struct.irri2 = insertvalue %struct.float4 %struct.irri1, float %c, 2
+  %struct.irri3 = insertvalue %struct.float4 %struct.irri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.irri3)
+
+  %struct.riir0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.riir1 = insertvalue %struct.float4 %struct.riir0, float 2.0, 1
+  %struct.riir2 = insertvalue %struct.float4 %struct.riir1, float 3.0, 2
+  %struct.riir3 = insertvalue %struct.float4 %struct.riir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.riir3)
+
+  %struct.riri0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.riri1 = insertvalue %struct.float4 %struct.riri0, float 2.0, 1
+  %struct.riri2 = insertvalue %struct.float4 %struct.riri1, float %c, 2
+  %struct.riri3 = insertvalue %struct.float4 %struct.riri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.riri3)
+
+  %struct.rrii0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.rrii1 = insertvalue %struct.float4 %struct.rrii0, float %b, 1
+  %struct.rrii2 = insertvalue %struct.float4 %struct.rrii1, float 3.0, 2
+  %struct.rrii3 = insertvalue %struct.float4 %struct.rrii2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.rrii3)
+
+  %struct.iiir0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iiir1 = insertvalue %struct.float4 %struct.iiir0, float 2.0, 1
+  %struct.iiir2 = insertvalue %struct.float4 %struct.iiir1, float 3.0, 2
+  %struct.iiir3 = insertvalue %struct.float4 %struct.iiir2, float %d, 3
+  call void @call_v4_f32(%struct.float4 %struct.iiir3)
+
+  %struct.iiri0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.iiri1 = insertvalue %struct.float4 %struct.iiri0, float 2.0, 1
+  %struct.iiri2 = insertvalue %struct.float4 %struct.iiri1, float %c, 2
+  %struct.iiri3 = insertvalue %struct.float4 %struct.iiri2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.iiri3)
+
+  %struct.irii0 = insertvalue %struct.float4 poison, float 1.0, 0
+  %struct.irii1 = insertvalue %struct.float4 %struct.irii0, float %b, 1
+  %struct.irii2 = insertvalue %struct.float4 %struct.irii1, float 3.0, 2
+  %struct.irii3 = insertvalue %struct.float4 %struct.irii2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.irii3)
+
+  %struct.riii0 = insertvalue %struct.float4 poison, float %a, 0
+  %struct.riii1 = insertvalue %struct.float4 %struct.riii0, float 2.0, 1
+  %struct.riii2 = insertvalue %struct.float4 %struct.riii1, float 3.0, 2
+  %struct.riii3 = insertvalue %struct.float4 %struct.riii2, float 4.0, 3
+  call void @call_v4_f32(%struct.float4 %struct.riii3)
+  ret void
+}
+
+declare void @call_v4_i8(%struct.char4)
+declare void @call_v4_i16(%struct.short4)
+declare void @call_v4_i32(%struct.int4)
+declare void @call_v4_f32(%struct.float4)
+
+!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8, !9, !10}
+!1 = !{ptr @call_v2_i8, !"align", i32 65538}
+!2 = !{ptr @call_v2_i16, !"align", i32 65540}
+!3 = !{ptr @call_v2_i32, !"align", i32 65544}
+!4 = !{ptr @call_v2_i64, !"align", i32 65552}
+!5 = !{ptr @call_v2_f32, !"align", i32 65544}
+!6 = !{ptr @call_v2_f64, !"align", i32 65552}
+!7 = !{ptr @call_v4_i8, !"align", i32 65540}
+!8 = !{ptr @call_v4_i16, !"align", i32 65544}
+!9 = !{ptr @call_v4_i32, !"align", i32 65552}
+!10 = !{ptr @call_v4_f32, !"align", i32 65552}



More information about the llvm-commits mailing list