[llvm] 590c968 - [NVPTX] fixup support for unaligned parameters and returns (#82562)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 22 17:27:32 PST 2024


Author: Alex MacLean
Date: 2024-02-22T17:27:28-08:00
New Revision: 590c968e7943e51bb00ff75d312435f24d983b2a

URL: https://github.com/llvm/llvm-project/commit/590c968e7943e51bb00ff75d312435f24d983b2a
DIFF: https://github.com/llvm/llvm-project/commit/590c968e7943e51bb00ff75d312435f24d983b2a.diff

LOG: [NVPTX] fixup support for unaligned parameters and returns (#82562)

Add support for unaligned parameters and return values. These must be
loaded and stored one byte at a time and then bit manipulation is used
to assemble the correct final result.

Added: 
    llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll

Modified: 
    llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
    llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
    llvm/test/CodeGen/NVPTX/param-load-store.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ded2f2584014d9..3ff8994602e16b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
+    if (Opcode == NVPTX::StoreRetvalI8) {
+      // Fine tune the opcode depending on the size of the operand.
+      // This helps to avoid creating redundant COPY instructions in
+      // InstrEmitter::AddRegisterOperand().
+      switch (Ops[0].getSimpleValueType().SimpleTy) {
+      default:
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreRetvalI8TruncI32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreRetvalI8TruncI64;
+        break;
+      }
+    }
     break;
   case 2:
     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
@@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
+      if (Opcode == NVPTX::StoreParamI8) {
+        // Fine tune the opcode depending on the size of the operand.
+        // This helps to avoid creating redundant COPY instructions in
+        // InstrEmitter::AddRegisterOperand().
+        switch (Ops[0].getSimpleValueType().SimpleTy) {
+        default:
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::StoreParamI8TruncI32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::StoreParamI8TruncI64;
+          break;
+        }
+      }
       break;
     case 2:
       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7d2fe78d142292..66a101036f9134 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -47,6 +47,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
@@ -59,6 +60,7 @@
 #include <cmath>
 #include <cstdint>
 #include <iterator>
+#include <optional>
 #include <sstream>
 #include <string>
 #include <utility>
@@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
   return DL.getABITypeAlign(Ty);
 }
 
+static bool adjustElementType(EVT &ElementType) {
+  switch (ElementType.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  case MVT::f16:
+  case MVT::bf16:
+    ElementType = MVT::i16;
+    return true;
+  case MVT::f32:
+  case MVT::v2f16:
+  case MVT::v2bf16:
+    ElementType = MVT::i32;
+    return true;
+  case MVT::f64:
+    ElementType = MVT::i64;
+    return true;
+  }
+}
+
+// Use byte-store when the param address of the argument value is unaligned.
+// This may happen when the return value is a field of a packed structure.
+//
+// This is called in LowerCall() when passing the param values.
+static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
+                                        uint64_t Offset, EVT ElementType,
+                                        SDValue StVal, SDValue &InGlue,
+                                        unsigned ArgID, const SDLoc &dl) {
+  // Bit logic only works on integer types
+  if (adjustElementType(ElementType))
+    StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
+
+  // Store each byte
+  SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+    // Shift the byte to the last byte position
+    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
+                                   DAG.getConstant(i * 8, dl, MVT::i32));
+    SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
+                               DAG.getConstant(Offset + i, dl, MVT::i32),
+                               ShiftVal, InGlue};
+    // Trunc store only the last byte by using
+    //     st.param.b8
+    // The register type can be larger than b8.
+    Chain = DAG.getMemIntrinsicNode(
+        NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
+        MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
+    InGlue = Chain.getValue(1);
+  }
+  return Chain;
+}
+
+// Use byte-load when the param address of the returned value is unaligned.
+// This may happen when the returned value is a field of a packed structure.
+static SDValue
+LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
+                           EVT ElementType, SDValue &InGlue,
+                           SmallVectorImpl<SDValue> &TempProxyRegOps,
+                           const SDLoc &dl) {
+  // Bit logic only works on integer types
+  EVT MergedType = ElementType;
+  adjustElementType(MergedType);
+
+  // Load each byte and construct the whole value. Initial value to 0
+  SDValue RetVal = DAG.getConstant(0, dl, MergedType);
+  // LoadParamMemI8 loads into i16 register only
+  SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
+  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+    SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+                              DAG.getConstant(Offset + i, dl, MVT::i32),
+                              InGlue};
+    // This will be selected to LoadParamMemI8
+    SDValue LdVal =
+        DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
+                                MVT::i8, MachinePointerInfo(), Align(1));
+    SDValue TmpLdVal = LdVal.getValue(0);
+    Chain = LdVal.getValue(1);
+    InGlue = LdVal.getValue(2);
+
+    TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
+                           TmpLdVal.getSimpleValueType(), TmpLdVal);
+    TempProxyRegOps.push_back(TmpLdVal);
+
+    SDValue CMask = DAG.getConstant(255, dl, MergedType);
+    SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
+    // Need to extend the i16 register to the whole width.
+    TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
+    // Mask off the high bits. Leave only the lower 8bits.
+    // Do this because we are using loadparam.b8.
+    TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
+    // Shift and merge
+    TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
+    RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
+  }
+  if (ElementType != MergedType)
+    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+  return RetVal;
+}
+
 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
 
@@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       if (NeedAlign)
         PartAlign = commonAlignment(ArgAlign, CurOffset);
 
-      // New store.
-      if (VectorInfo[j] & PVF_FIRST) {
-        assert(StoreOperands.empty() && "Unfinished preceding store.");
-        StoreOperands.push_back(Chain);
-        StoreOperands.push_back(
-            DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
-        StoreOperands.push_back(DAG.getConstant(
-            IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
-            dl, MVT::i32));
-      }
-
       SDValue StVal = OutVals[OIdx];
 
       MVT PromotedVT;
@@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
       }
 
+      // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+      // scalar store. In such cases, fall back to byte stores.
+      if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
+          PartAlign.value() <
+              DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
+        assert(StoreOperands.empty() && "Unfinished preceeding store.");
+        Chain = LowerUnalignedStoreParam(
+            DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
+            StVal, InGlue, ParamCount, dl);
+
+        // LowerUnalignedStoreParam took care of inserting the necessary nodes
+        // into the SDAG, so just move on to the next element.
+        if (!IsByVal)
+          ++OIdx;
+        continue;
+      }
+
+      // New store.
+      if (VectorInfo[j] & PVF_FIRST) {
+        assert(StoreOperands.empty() && "Unfinished preceding store.");
+        StoreOperands.push_back(Chain);
+        StoreOperands.push_back(
+            DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
+
+        StoreOperands.push_back(DAG.getConstant(
+            IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
+            dl, MVT::i32));
+      }
+
       // Record the value to store.
       StoreOperands.push_back(StVal);
 
@@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   SmallVector<SDValue, 16> ProxyRegOps;
   SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
+  // An item of the vector is filled if the element does not need a ProxyReg
+  // operation on it and should be added to InVals as is. ProxyRegOps and
+  // ProxyRegTruncates contain empty/none items at the same index.
+  SmallVector<SDValue, 16> RetElts;
+  // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
+  // to use the values of `LoadParam`s; they are replaced later, when
+  // `CALLSEQ_END` is added.
+  SmallVector<SDValue, 16> TempProxyRegOps;
 
   // Generate loads from param memory/moves from registers for result
   if (Ins.size() > 0) {
@@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         EltType = MVT::i16;
       }
 
+      // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+      // scalar load. In such cases, fall back to byte loads.
+      if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
+          EltAlign < DL.getABITypeAlign(
+                         TheLoadType.getTypeForEVT(*DAG.getContext()))) {
+        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+        SDValue Ret = LowerUnalignedLoadRetParam(
+            DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
+        ProxyRegOps.push_back(SDValue());
+        ProxyRegTruncates.push_back(std::optional<MVT>());
+        RetElts.resize(i);
+        RetElts.push_back(Ret);
+
+        continue;
+      }
+
       // Record index of the very first element of the vector.
       if (VectorInfo[i] & PVF_FIRST) {
         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
@@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // will not get lost. Otherwise, during libcalls expansion, the nodes can become
   // dangling.
   for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
+    if (i < RetElts.size() && RetElts[i]) {
+      InVals.push_back(RetElts[i]);
+      continue;
+    }
+
     SDValue Ret = DAG.getNode(
       NVPTXISD::ProxyReg, dl,
       DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
@@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     InVals.push_back(Ret);
   }
 
+  for (SDValue &T : TempProxyRegOps) {
+    SDValue Repl = DAG.getNode(
+        NVPTXISD::ProxyReg, dl,
+        DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
+        {Chain, T.getOperand(0), InGlue});
+    DAG.ReplaceAllUsesWith(T, Repl);
+    DAG.RemoveDeadNode(T.getNode());
+
+    Chain = Repl.getValue(1);
+    InGlue = Repl.getValue(2);
+  }
+
   // set isTailCall to false for now, until we figure out how to express
   // tail call optimization in PTX
   isTailCall = false;
@@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
           Value *srcValue = Constant::getNullValue(PointerType::get(
               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+
+          const MaybeAlign PartAlign = [&]() -> MaybeAlign {
+            if (aggregateIsPacked)
+              return Align(1);
+            if (NumElts != 1)
+              return std::nullopt;
+            Align PartAlign =
+                (Offsets[parti] == 0 && PAL.getParamAlignment(i))
+                    ? PAL.getParamAlignment(i).value()
+                    : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
+            return commonAlignment(PartAlign, Offsets[parti]);
+          }();
           SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
-                                  MachinePointerInfo(srcValue),
-                                  MaybeAlign(aggregateIsPacked ? 1 : 0),
+                                  MachinePointerInfo(srcValue), PartAlign,
                                   MachineMemOperand::MODereferenceable |
                                       MachineMemOperand::MOInvariant);
           if (P.getNode())
@@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
   return Chain;
 }
 
+// Use byte-store when the param address of the return value is unaligned.
+// This may happen when the return value is a field of a packed structure.
+static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
+                                      uint64_t Offset, EVT ElementType,
+                                      SDValue RetVal, const SDLoc &dl) {
+  // Bit logic only works on integer types
+  if (adjustElementType(ElementType))
+    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+  // Store each byte
+  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+    // Shift the byte to the last byte position
+    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
+                                   DAG.getConstant(i * 8, dl, MVT::i32));
+    SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
+                               ShiftVal};
+    // Trunc store only the last byte by using
+    //     st.param.b8
+    // The register type can be larger than b8.
+    Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+                                    DAG.getVTList(MVT::Other), StoreOperands,
+                                    MVT::i8, MachinePointerInfo(), std::nullopt,
+                                    MachineMemOperand::MOStore);
+  }
+  return Chain;
+}
+
 SDValue
 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                  bool isVarArg,
@@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
   SmallVector<SDValue, 6> StoreOperands;
   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
-    // New load/store. Record chain and offset operands.
-    if (VectorInfo[i] & PVF_FIRST) {
-      assert(StoreOperands.empty() && "Orphaned operand list.");
-      StoreOperands.push_back(Chain);
-      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
-    }
-
     SDValue OutVal = OutVals[i];
     SDValue RetVal = PromotedOutVals[i];
 
@@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
     }
 
+    // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
+    // for a scalar store. In such cases, fall back to byte stores.
+    if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
+      EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+      Align ElementTypeAlign =
+          DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
+      Align ElementAlign =
+          commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
+      if (ElementAlign < ElementTypeAlign) {
+        assert(StoreOperands.empty() && "Orphaned operand list.");
+        Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
+                                       RetVal, dl);
+
+        // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
+        // into the graph, so just move on to the next element.
+        continue;
+      }
+    }
+
+    // New load/store. Record chain and offset operands.
+    if (VectorInfo[i] & PVF_FIRST) {
+      assert(StoreOperands.empty() && "Orphaned operand list.");
+      StoreOperands.push_back(Chain);
+      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+    }
+
     // Record the value to return.
     StoreOperands.push_back(RetVal);
 

diff  --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 55a1955a7f497e..b3517ce066b87a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2738,6 +2738,8 @@ def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
 
 def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
 def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">;
+def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">;
 def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
 def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
 def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
@@ -2757,6 +2759,8 @@ def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
 def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
 def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
 def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">;
+def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">;
 def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
 def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
 def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;

diff  --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index c14dc88431d316..a29d4e1875cd7b 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -1135,31 +1135,86 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
 ; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+2];
 ; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+1];
 ; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0];
-; --- TODO
-; --- Unaligned parameter store/ return value load is broken in both nvcc
-; --- and llvm and needs to be fixed.
 ; CHECK:        .param .align 1 .b8 param0[25];
-; CHECK-DAG:        st.param.b32    [param0+0],
-; CHECK-DAG:        st.param.b32    [param0+4],
+; CHECK-DAG:        st.param.b8     [param0+0],
+; CHECK-DAG:        st.param.b8     [param0+1],
+; CHECK-DAG:        st.param.b8     [param0+2],
+; CHECK-DAG:        st.param.b8     [param0+3],
+; CHECK-DAG:        st.param.b8     [param0+4],
+; CHECK-DAG:        st.param.b8     [param0+5],
+; CHECK-DAG:        st.param.b8     [param0+6],
+; CHECK-DAG:        st.param.b8     [param0+7],
 ; CHECK-DAG:        st.param.b8     [param0+8],
-; CHECK-DAG:        st.param.b32    [param0+9],
-; CHECK-DAG:        st.param.b32    [param0+13],
-; CHECK-DAG:        st.param.b64    [param0+17],
+; CHECK-DAG:        st.param.b8     [param0+9],
+; CHECK-DAG:        st.param.b8     [param0+10],
+; CHECK-DAG:        st.param.b8     [param0+11],
+; CHECK-DAG:        st.param.b8     [param0+12],
+; CHECK-DAG:        st.param.b8     [param0+13],
+; CHECK-DAG:        st.param.b8     [param0+14],
+; CHECK-DAG:        st.param.b8     [param0+15],
+; CHECK-DAG:        st.param.b8     [param0+16],
+; CHECK-DAG:        st.param.b8     [param0+17],
+; CHECK-DAG:        st.param.b8     [param0+18],
+; CHECK-DAG:        st.param.b8     [param0+19],
+; CHECK-DAG:        st.param.b8     [param0+20],
+; CHECK-DAG:        st.param.b8     [param0+21],
+; CHECK-DAG:        st.param.b8     [param0+22],
+; CHECK-DAG:        st.param.b8     [param0+23],
+; CHECK-DAG:        st.param.b8     [param0+24],
 ; CHECK:            .param .align 1 .b8 retval0[25];
 ; CHECK:            call.uni (retval0),
 ; CHECK-NEXT:       test_s_i1i32x4p,
-; CHECK-DAG:        ld.param.b32    %r41, [retval0+0];
-; CHECK-DAG:        ld.param.b32    %r42, [retval0+4];
-; CHECK-DAG:        ld.param.b8     %rs2, [retval0+8];
-; CHECK-DAG:        ld.param.b32    %r43, [retval0+9];
-; CHECK-DAG:        ld.param.b32    %r44, [retval0+13];
-; CHECK-DAG:        ld.param.b64    %rd23, [retval0+17];
-; CHECK-DAG:        st.param.b32    [func_retval0+0],
-; CHECK-DAG:        st.param.b32    [func_retval0+4],
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+0];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+1];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+2];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+3];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+4];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+5];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+6];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+7];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+9];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+10];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+11];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+12];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+13];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+14];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+15];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+16];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+17];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+18];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+19];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+20];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+21];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+22];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+23];
+; CHECK-DAG:        ld.param.b8 %rs{{[0-9]+}}, [retval0+24];
+; CHECK:            } // callseq
+; CHECK-DAG:        st.param.b8     [func_retval0+0],
+; CHECK-DAG:        st.param.b8     [func_retval0+1],
+; CHECK-DAG:        st.param.b8     [func_retval0+2],
+; CHECK-DAG:        st.param.b8     [func_retval0+3],
+; CHECK-DAG:        st.param.b8     [func_retval0+4],
+; CHECK-DAG:        st.param.b8     [func_retval0+5],
+; CHECK-DAG:        st.param.b8     [func_retval0+6],
+; CHECK-DAG:        st.param.b8     [func_retval0+7],
 ; CHECK-DAG:        st.param.b8     [func_retval0+8],
-; CHECK-DAG:        st.param.b32    [func_retval0+9],
-; CHECK-DAG:        st.param.b32    [func_retval0+13],
-; CHECK-DAG:        st.param.b64    [func_retval0+17],
+; CHECK-DAG:        st.param.b8     [func_retval0+9],
+; CHECK-DAG:        st.param.b8     [func_retval0+10],
+; CHECK-DAG:        st.param.b8     [func_retval0+11],
+; CHECK-DAG:        st.param.b8     [func_retval0+12],
+; CHECK-DAG:        st.param.b8     [func_retval0+13],
+; CHECK-DAG:        st.param.b8     [func_retval0+14],
+; CHECK-DAG:        st.param.b8     [func_retval0+15],
+; CHECK-DAG:        st.param.b8     [func_retval0+16],
+; CHECK-DAG:        st.param.b8     [func_retval0+17],
+; CHECK-DAG:        st.param.b8     [func_retval0+18],
+; CHECK-DAG:        st.param.b8     [func_retval0+19],
+; CHECK-DAG:        st.param.b8     [func_retval0+20],
+; CHECK-DAG:        st.param.b8     [func_retval0+21],
+; CHECK-DAG:        st.param.b8     [func_retval0+22],
+; CHECK-DAG:        st.param.b8     [func_retval0+23],
+; CHECK-DAG:        st.param.b8     [func_retval0+24],
 
 define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
        %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);

diff  --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
new file mode 100644
index 00000000000000..40a3e9e945a23e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -0,0 +1,385 @@
+; Verifies correctness of load/store of parameters and return values.
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %}
+
+%s_i8i16p = type { <{ i16, i8, i16 }>, i64 }
+%s_i8i32p = type { <{ i32, i8, i32 }>, i64 }
+%s_i8i64p = type { <{ i64, i8, i64 }>, i64 }
+%s_i8f16p = type { <{ half, i8, half }>, i64 }
+%s_i8f16x2p = type { <{ <2 x half>, i8, <2 x half> }>, i64 }
+%s_i8f32p = type { <{ float, i8, float }>, i64 }
+%s_i8f64p = type { <{ double, i8, double }>, i64 }
+
+; -- All loads/stores from parameters aligned to only one byte must be done
+;    one byte at a time.
+; -- Notes:
+;   -- There are two fields of interest in the packed part of each struct: one
+;      at a naturally aligned offset and one at a misaligned offset. The former
+;      should be loaded or stored as a whole, and the latter byte by byte.
+;   -- Only the loads and stores of these fields are checked in the following
+;      series of tests, to keep the tests concise.
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[16])
+; CHECK-LABEL: test_s_i8i16p(
+; CHECK:        .param .align 8 .b8 test_s_i8i16p_param_0[16]
+; CHECK-DAG:    ld.param.u16 [[P0:%rs[0-9]+]],   [test_s_i8i16p_param_0];
+; CHECK-DAG:    ld.param.u8 [[P2_0:%rs[0-9]+]],   [test_s_i8i16p_param_0+3];
+; CHECK-DAG:    ld.param.u8 [[P2_1:%rs[0-9]+]],   [test_s_i8i16p_param_0+4];
+; CHECK-DAG:    shl.b16     [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    or.b16      [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK:        { // callseq
+; CHECK:        .param .align 8 .b8 param0[16];
+; CHECK-DAG:    st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+3], [[P2_1_or]];
+; CHECK-DAG:    st.param.b8  [param0+4], [[P2_1]];
+; CHECK:        .param .align 8 .b8 retval0[16];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8i16p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b16 [[R0:%rs[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+3];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+4];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG:    shl.b16      [[R2_1_shl:%rs[0-9]+]], [[R2_1]], 8;
+; CHECK-DAG:    and.b16      [[R2_0_and:%rs[0-9]+]], [[R2_0]], 255;
+; CHECK-DAG:    or.b16       [[R2:%rs[0-9]+]], [[R2_0_and]], [[R2_1_shl]];
+; CHECK-DAG:    st.param.b8  [func_retval0+3], [[R2]];
+; CHECK-DAG:    and.b16      [[R2_1_and:%rs[0-9]+]], [[R2_1]], 255;
+; CHECK-DAG:    st.param.b8  [func_retval0+4], [[R2_1_and]];
+; CHECK:        ret;
+
+define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
+       %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a)
+       ret %s_i8i16p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8i32p(
+; CHECK:        .param .align 8 .b8 test_s_i8i32p_param_0[24]
+; CHECK-DAG:    ld.param.u32 [[P0:%r[0-9]+]],   [test_s_i8i32p_param_0];
+; CHECK-DAG:    ld.param.u8 [[P2_0:%r[0-9]+]],   [test_s_i8i32p_param_0+5];
+; CHECK-DAG:    ld.param.u8 [[P2_1:%r[0-9]+]],   [test_s_i8i32p_param_0+6];
+; CHECK-DAG:    ld.param.u8 [[P2_2:%r[0-9]+]],   [test_s_i8i32p_param_0+7];
+; CHECK-DAG:    ld.param.u8 [[P2_3:%r[0-9]+]],   [test_s_i8i32p_param_0+8];
+; CHECK-DAG:    shl.b32     [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b32     [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b32     [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b32      [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b32      [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b32      [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG:    shr.u32     [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u32     [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK:        { // callseq
+; CHECK-DAG:    .param .align 8 .b8 param0[24];
+; CHECK-DAG:    st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+5], [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+6], [[P2_1_shr]];
+; CHECK-DAG:    st.param.b8  [param0+7], [[P2_2_shr]];
+; CHECK-DAG:    st.param.b8  [param0+8], [[P2_3]];
+; CHECK:        .param .align 8 .b8 retval0[24];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8i32p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b32 [[R0:%r[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+5],
+; CHECK-DAG:    st.param.b8  [func_retval0+6],
+; CHECK-DAG:    st.param.b8  [func_retval0+7],
+; CHECK-DAG:    st.param.b8  [func_retval0+8],
+; CHECK:        ret;
+
+define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
+       %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
+       ret %s_i8i32p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i8i64p(
+; CHECK:        .param .align 8 .b8 test_s_i8i64p_param_0[32]
+; CHECK-DAG:    ld.param.u64 [[P0:%rd[0-9]+]],   [test_s_i8i64p_param_0];
+; CHECK-DAG:    ld.param.u8 [[P2_0:%rd[0-9]+]],   [test_s_i8i64p_param_0+9];
+; CHECK-DAG:    ld.param.u8 [[P2_1:%rd[0-9]+]],   [test_s_i8i64p_param_0+10];
+; CHECK-DAG:    ld.param.u8 [[P2_2:%rd[0-9]+]],   [test_s_i8i64p_param_0+11];
+; CHECK-DAG:    ld.param.u8 [[P2_3:%rd[0-9]+]],   [test_s_i8i64p_param_0+12];
+; CHECK-DAG:    ld.param.u8 [[P2_4:%rd[0-9]+]],   [test_s_i8i64p_param_0+13];
+; CHECK-DAG:    ld.param.u8 [[P2_5:%rd[0-9]+]],   [test_s_i8i64p_param_0+14];
+; CHECK-DAG:    ld.param.u8 [[P2_6:%rd[0-9]+]],   [test_s_i8i64p_param_0+15];
+; CHECK-DAG:    ld.param.u8 [[P2_7:%rd[0-9]+]],   [test_s_i8i64p_param_0+16];
+; CHECK-DAG:    shl.b64      [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b64      [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b64      [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b64       [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b64       [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b64       [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]];
+; CHECK-DAG:    shl.b64      [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8;
+; CHECK-DAG:    shl.b64      [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16;
+; CHECK-DAG:    shl.b64      [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24;
+; CHECK-DAG:    or.b64       [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]];
+; CHECK-DAG:    or.b64       [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]];
+; CHECK-DAG:    or.b64       [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]];
+; CHECK-DAG:    shl.b64      [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32;
+; CHECK-DAG:    or.b64       [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]];
+; CHECK-DAG:    shr.u64      [[P2_shr_1:%rd[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u64      [[P2_shr_2:%rd[0-9]+]], [[P2]], 16;
+; CHECK-DAG:    shr.u64      [[P2_shr_3:%rd[0-9]+]], [[P2]], 24;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
+; CHECK:        { // callseq
+; CHECK:        .param .align 8 .b8 param0[32];
+; CHECK-DAG:    st.param.b64 [param0+0],  [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+9],  [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+10], [[P2_shr_1]];
+; CHECK-DAG:    st.param.b8  [param0+11], [[P2_shr_2]];
+; CHECK-DAG:    st.param.b8  [param0+12], [[P2_shr_3]];
+; CHECK-DAG:    st.param.b8  [param0+13], [[P2_or_5]];
+; CHECK-DAG:    st.param.b8  [param0+14], [[P2_bfe_4]];
+; CHECK-DAG:    st.param.b8  [param0+15], [[P2_bfe_5]];
+; CHECK-DAG:    st.param.b8  [param0+16], [[P2_bfe_6]];
+; CHECK:        .param .align 8 .b8 retval0[32];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8i64p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b64 [[R0:%rd[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+9];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+10];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+11];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+12];
+; CHECK-DAG:    ld.param.b8  [[R2_4:%rs[0-9]+]], [retval0+13];
+; CHECK-DAG:    ld.param.b8  [[R2_5:%rs[0-9]+]], [retval0+14];
+; CHECK-DAG:    ld.param.b8  [[R2_6:%rs[0-9]+]], [retval0+15];
+; CHECK-DAG:    ld.param.b8  [[R2_7:%rs[0-9]+]], [retval0+16];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b64 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+9],
+; CHECK-DAG:    st.param.b8  [func_retval0+10],
+; CHECK-DAG:    st.param.b8  [func_retval0+11],
+; CHECK-DAG:    st.param.b8  [func_retval0+12],
+; CHECK-DAG:    st.param.b8  [func_retval0+13],
+; CHECK-DAG:    st.param.b8  [func_retval0+14],
+; CHECK-DAG:    st.param.b8  [func_retval0+15],
+; CHECK-DAG:    st.param.b8  [func_retval0+16],
+; CHECK:        ret;
+
+define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
+       %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a)
+       ret %s_i8i64p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[16])
+; CHECK-LABEL: test_s_i8f16p(
+; CHECK:        .param .align 8 .b8 test_s_i8f16p_param_0[16]
+; CHECK-DAG:    ld.param.b16 [[P0:%rs[0-9]+]],     [test_s_i8f16p_param_0];
+; CHECK-DAG:    ld.param.u8  [[P2_0:%rs[0-9]+]],   [test_s_i8f16p_param_0+3];
+; CHECK-DAG:    ld.param.u8  [[P2_1:%rs[0-9]+]],   [test_s_i8f16p_param_0+4];
+; CHECK-DAG:    shl.b16      [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    or.b16       [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK:        { // callseq
+; CHECK:        .param .align 8 .b8 param0[16];
+; CHECK-DAG:    st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+3], [[P2_1_or]];
+; CHECK-DAG:    st.param.b8  [param0+4], [[P2_1]];
+; CHECK:        .param .align 8 .b8 retval0[16];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8f16p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b16 [[R0:%rs[0-9]+]],     [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2I_0:%rs[0-9]+]], [retval0+3];
+; CHECK-DAG:    ld.param.b8  [[R2I_1:%rs[0-9]+]], [retval0+4];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG:    shl.b16      [[R2I_1_shl:%rs[0-9]+]], [[R2I_1]], 8;
+; CHECK-DAG:    and.b16      [[R2I_0_and:%rs[0-9]+]], [[R2I_0]], 255;
+; CHECK-DAG:    or.b16       [[R2I:%rs[0-9]+]], [[R2I_0_and]], [[R2I_1_shl]];
+; CHECK-DAG:    st.param.b8  [func_retval0+3],  [[R2I]];
+; CHECK-DAG:    and.b16      [[R2I_1_and:%rs[0-9]+]], [[R2I_1]], 255;
+; CHECK-DAG:    st.param.b8  [func_retval0+4],  [[R2I_1_and]];
+; CHECK:        ret;
+
+define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
+       %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a)
+       ret %s_i8f16p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8f16x2p(
+; CHECK:        .param .align 8 .b8 test_s_i8f16x2p_param_0[24]
+; CHECK-DAG:    ld.param.b32 [[P0:%r[0-9]+]],  [test_s_i8f16x2p_param_0];
+; CHECK-DAG:    ld.param.u8  [[P2_0:%r[0-9]+]],   [test_s_i8f16x2p_param_0+5];
+; CHECK-DAG:    ld.param.u8  [[P2_1:%r[0-9]+]],   [test_s_i8f16x2p_param_0+6];
+; CHECK-DAG:    ld.param.u8  [[P2_2:%r[0-9]+]],   [test_s_i8f16x2p_param_0+7];
+; CHECK-DAG:    ld.param.u8  [[P2_3:%r[0-9]+]],   [test_s_i8f16x2p_param_0+8];
+; CHECK-DAG:    shl.b32      [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b32      [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b32      [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b32       [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b32       [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b32       [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG:    shr.u32      [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u32      [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK:        { // callseq
+; CHECK-DAG:    .param .align 8 .b8 param0[24];
+; CHECK-DAG:    st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+5], [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+6], [[P2_1_shr]];
+; CHECK-DAG:    st.param.b8  [param0+7], [[P2_2_shr]];
+; CHECK-DAG:    st.param.b8  [param0+8], [[P2_3]];
+; CHECK:        .param .align 8 .b8 retval0[24];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8f16x2p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.b32 [[R0:%r[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+5],
+; CHECK-DAG:    st.param.b8  [func_retval0+6],
+; CHECK-DAG:    st.param.b8  [func_retval0+7],
+; CHECK-DAG:    st.param.b8  [func_retval0+8],
+; CHECK:        ret;
+
+define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
+       %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
+       ret %s_i8f16x2p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8f32p(
+; CHECK:        .param .align 8 .b8 test_s_i8f32p_param_0[24]
+; CHECK-DAG:    ld.param.f32 [[P0:%f[0-9]+]],    [test_s_i8f32p_param_0];
+; CHECK-DAG:    ld.param.u8  [[P2_0:%r[0-9]+]],   [test_s_i8f32p_param_0+5];
+; CHECK-DAG:    ld.param.u8  [[P2_1:%r[0-9]+]],   [test_s_i8f32p_param_0+6];
+; CHECK-DAG:    ld.param.u8  [[P2_2:%r[0-9]+]],   [test_s_i8f32p_param_0+7];
+; CHECK-DAG:    ld.param.u8  [[P2_3:%r[0-9]+]],   [test_s_i8f32p_param_0+8];
+; CHECK-DAG:    shl.b32      [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b32      [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b32      [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b32       [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b32       [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b32       [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG:    shr.u32      [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u32      [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK:        { // callseq
+; CHECK-DAG:    .param .align 8 .b8 param0[24];
+; CHECK-DAG:    st.param.f32 [param0+0], [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+5], [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+6], [[P2_1_shr]];
+; CHECK-DAG:    st.param.b8  [param0+7], [[P2_2_shr]];
+; CHECK-DAG:    st.param.b8  [param0+8], [[P2_3]];
+; CHECK:        .param .align 8 .b8 retval0[24];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8f32p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.f32 [[R0:%f[0-9]+]],    [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.f32 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+5],
+; CHECK-DAG:    st.param.b8  [func_retval0+6],
+; CHECK-DAG:    st.param.b8  [func_retval0+7],
+; CHECK-DAG:    st.param.b8  [func_retval0+8],
+; CHECK:        ret;
+
+define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
+       %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
+       ret %s_i8f32p %r
+}
+
+; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i8f64p(
+; CHECK:        .param .align 8 .b8 test_s_i8f64p_param_0[32]
+; CHECK-DAG:    ld.param.f64 [[P0:%fd[0-9]+]],    [test_s_i8f64p_param_0];
+; CHECK-DAG:    ld.param.u8  [[P2_0:%rd[0-9]+]],   [test_s_i8f64p_param_0+9];
+; CHECK-DAG:    ld.param.u8  [[P2_1:%rd[0-9]+]],   [test_s_i8f64p_param_0+10];
+; CHECK-DAG:    ld.param.u8  [[P2_2:%rd[0-9]+]],   [test_s_i8f64p_param_0+11];
+; CHECK-DAG:    ld.param.u8  [[P2_3:%rd[0-9]+]],   [test_s_i8f64p_param_0+12];
+; CHECK-DAG:    ld.param.u8  [[P2_4:%rd[0-9]+]],   [test_s_i8f64p_param_0+13];
+; CHECK-DAG:    ld.param.u8  [[P2_5:%rd[0-9]+]],   [test_s_i8f64p_param_0+14];
+; CHECK-DAG:    ld.param.u8  [[P2_6:%rd[0-9]+]],   [test_s_i8f64p_param_0+15];
+; CHECK-DAG:    ld.param.u8  [[P2_7:%rd[0-9]+]],   [test_s_i8f64p_param_0+16];
+; CHECK-DAG:    shl.b64      [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG:    shl.b64      [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG:    shl.b64      [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG:    or.b64       [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG:    or.b64       [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG:    or.b64       [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]];
+; CHECK-DAG:    shl.b64      [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8;
+; CHECK-DAG:    shl.b64      [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16;
+; CHECK-DAG:    shl.b64      [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24;
+; CHECK-DAG:    or.b64       [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]];
+; CHECK-DAG:    or.b64       [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]];
+; CHECK-DAG:    or.b64       [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]];
+; CHECK-DAG:    shl.b64      [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32;
+; CHECK-DAG:    or.b64       [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]];
+; CHECK-DAG:    shr.u64      [[P2_shr_1:%rd[0-9]+]], [[P2]], 8;
+; CHECK-DAG:    shr.u64      [[P2_shr_2:%rd[0-9]+]], [[P2]], 16;
+; CHECK-DAG:    shr.u64      [[P2_shr_3:%rd[0-9]+]], [[P2]], 24;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16;
+; CHECK-DAG:    bfe.u64      [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
+; CHECK:        { // callseq
+; CHECK:        .param .align 8 .b8 param0[32];
+; CHECK-DAG:    st.param.f64 [param0+0],  [[P0]];
+; CHECK-DAG:    st.param.b8  [param0+9],  [[P2]];
+; CHECK-DAG:    st.param.b8  [param0+10], [[P2_shr_1]];
+; CHECK-DAG:    st.param.b8  [param0+11], [[P2_shr_2]];
+; CHECK-DAG:    st.param.b8  [param0+12], [[P2_shr_3]];
+; CHECK-DAG:    st.param.b8  [param0+13], [[P2_or_5]];
+; CHECK-DAG:    st.param.b8  [param0+14], [[P2_bfe_4]];
+; CHECK-DAG:    st.param.b8  [param0+15], [[P2_bfe_5]];
+; CHECK-DAG:    st.param.b8  [param0+16], [[P2_bfe_6]];
+; CHECK:        .param .align 8 .b8 retval0[32];
+; CHECK-NEXT:   call.uni (retval0),
+; CHECK-NEXT:   test_s_i8f64p,
+; CHECK-NEXT:   (
+; CHECK-NEXT:   param0
+; CHECK-NEXT:   );
+; CHECK-DAG:    ld.param.f64 [[R0:%fd[0-9]+]],   [retval0+0];
+; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+9];
+; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+10];
+; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+11];
+; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+12];
+; CHECK-DAG:    ld.param.b8  [[R2_4:%rs[0-9]+]], [retval0+13];
+; CHECK-DAG:    ld.param.b8  [[R2_5:%rs[0-9]+]], [retval0+14];
+; CHECK-DAG:    ld.param.b8  [[R2_6:%rs[0-9]+]], [retval0+15];
+; CHECK-DAG:    ld.param.b8  [[R2_7:%rs[0-9]+]], [retval0+16];
+; CHECK:        } // callseq
+; CHECK-DAG:    st.param.f64 [func_retval0+0], [[R0]];
+; CHECK-DAG:    st.param.b8  [func_retval0+9],
+; CHECK-DAG:    st.param.b8  [func_retval0+10],
+; CHECK-DAG:    st.param.b8  [func_retval0+11],
+; CHECK-DAG:    st.param.b8  [func_retval0+12],
+; CHECK-DAG:    st.param.b8  [func_retval0+13],
+; CHECK-DAG:    st.param.b8  [func_retval0+14],
+; CHECK-DAG:    st.param.b8  [func_retval0+15],
+; CHECK-DAG:    st.param.b8  [func_retval0+16],
+; CHECK:        ret;
+
+define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
+       %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
+       ret %s_i8f64p %r
+}


        


More information about the llvm-commits mailing list