[clang] f854434 - [NVPTX] Enhance vectorization of ld.param & st.param

Daniil Kovalev via cfe-commits cfe-commits at lists.llvm.org
Thu Mar 24 02:25:48 PDT 2022


Author: Daniil Kovalev
Date: 2022-03-24T12:25:36+03:00
New Revision: f854434f0f2a01027bdaad8e6fdac5a782fce291

URL: https://github.com/llvm/llvm-project/commit/f854434f0f2a01027bdaad8e6fdac5a782fce291
DIFF: https://github.com/llvm/llvm-project/commit/f854434f0f2a01027bdaad8e6fdac5a782fce291.diff

LOG: [NVPTX] Enhance vectorization of ld.param & st.param

Since function parameters and return values are passed via param space, we
can force special alignment for values hold in it which will add vectorization
options. This change may be done if the function has private or internal
linkage. Special alignment is forced during 2 phases.

1) Instruction selection lowering. Here we use special alignment for function
   prototypes (changing both own return value and parameters alignment), call
   lowering (changing both callee's return value and parameters alignment).

2) IR pass nvptx-lower-args. Here we change alignment of byval parameters that
   belong to param space (or are casted to it). We only handle cases when all
   uses of such parameters are loads from it. For such loads, we can change the
   alignment according to special type alignment and the load offset. Then,
   load-store-vectorizer IR pass will perform vectorization where alignment
   allows it.

Special alignment calculated as maximum from default ABI type alignment and
alignment 16. Alignment 16 is chosen because it's the maximum size of
vectorized ld.param & st.param.

Before specifying such special alignment, we should check if it is a multiple
of the alignment that the type already has. For example, if a value has an
enforced alignment of 64, default ABI alignment of 4 and special alignment
of 16, we should preserve 64.

This patch will be followed by a refactoring patch that removes duplicating
code in handling byval and non-byval arguments.

Differential Revision: https://reviews.llvm.org/D121549

Added: 
    llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
    llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll

Modified: 
    clang/test/CodeGenCUDA/device-fun-linkage.cu
    llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
    llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
    llvm/lib/Target/NVPTX/NVPTXISelLowering.h
    llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp

Removed: 
    


################################################################################
diff  --git a/clang/test/CodeGenCUDA/device-fun-linkage.cu b/clang/test/CodeGenCUDA/device-fun-linkage.cu
index d1b9db261151b..d8ad6d438be9c 100644
--- a/clang/test/CodeGenCUDA/device-fun-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-fun-linkage.cu
@@ -1,19 +1,32 @@
-// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
-// RUN:   -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-llvm -o - %s \
 // RUN:   | FileCheck -check-prefix=NORDC %s
-// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
-// RUN:   -fgpu-rdc -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-llvm -o - %s \
+// RUN:   | FileCheck -check-prefix=NORDC-NEG %s
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -fgpu-rdc -emit-llvm -o - %s \
 // RUN:   | FileCheck -check-prefix=RDC %s
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -fgpu-rdc -emit-llvm -o - %s \
+// RUN:   | FileCheck -check-prefix=RDC-NEG %s
 
 #include "Inputs/cuda.h"
 
-// NORDC: define internal void @_Z4funcIiEvv()
-// NORDC: define{{.*}} void @_Z6kernelIiEvv()
-// RDC: define weak_odr void @_Z4funcIiEvv()
-// RDC: define weak_odr void @_Z6kernelIiEvv()
-
 template <typename T> __device__ void func() {}
 template <typename T> __global__ void kernel() {}
 
 template __device__ void func<int>();
+// NORDC:     define internal void @_Z4funcIiEvv()
+// RDC:       define weak_odr void @_Z4funcIiEvv()
+
 template __global__ void kernel<int>();
+// NORDC:     define void @_Z6kernelIiEvv()
+// RDC:       define weak_odr void @_Z6kernelIiEvv()
+
+// Ensure that unused static device function is eliminated
+static __device__ void static_func() {}
+// NORDC-NEG-NOT: define{{.*}} void @_ZL13static_funcv()
+// RDC-NEG-NOT:   define{{.*}} void @_ZL13static_funcv()
+
+// Ensure that kernel function has external or weak_odr
+// linkage regardless static specifier
+static __global__ void static_kernel() {}
+// NORDC:     define void @_ZL13static_kernelv()
+// RDC:       define weak_odr void @_ZL13static_kernelv()

diff  --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 2516dff52efdf..e8322a0a8425b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -329,7 +329,7 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
 void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
   const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
-  const TargetLowering *TLI = STI.getTargetLowering();
+  const auto *TLI = cast<NVPTXTargetLowering>(STI.getTargetLowering());
 
   Type *Ty = F->getReturnType();
 
@@ -363,7 +363,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
       unsigned totalsz = DL.getTypeAllocSize(Ty);
       unsigned retAlignment = 0;
       if (!getAlign(*F, 0, retAlignment))
-        retAlignment = DL.getABITypeAlignment(Ty);
+        retAlignment = TLI->getFunctionParamOptimizedAlign(F, Ty, DL).value();
       O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
         << "]";
     } else
@@ -1348,7 +1348,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
   const AttributeList &PAL = F->getAttributes();
   const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
-  const TargetLowering *TLI = STI.getTargetLowering();
+  const auto *TLI = cast<NVPTXTargetLowering>(STI.getTargetLowering());
+
   Function::const_arg_iterator I, E;
   unsigned paramIndex = 0;
   bool first = true;
@@ -1405,18 +1406,24 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
       }
     }
 
+    auto getOptimalAlignForParam = [TLI, &DL, &PAL, F,
+                                    paramIndex](Type *Ty) -> Align {
+      Align TypeAlign = TLI->getFunctionParamOptimizedAlign(F, Ty, DL);
+      MaybeAlign ParamAlign = PAL.getParamAlignment(paramIndex);
+      return max(TypeAlign, ParamAlign);
+    };
+
     if (!PAL.hasParamAttr(paramIndex, Attribute::ByVal)) {
       if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
         // Just print .param .align <a> .b8 .param[size];
-        // <a> = PAL.getparamalignment
+        // <a>  = optimal alignment for the element type; always multiple of
+        //        PAL.getParamAlignment
         // size = typeallocsize of element type
-        const Align align = DL.getValueOrABITypeAlignment(
-            PAL.getParamAlignment(paramIndex), Ty);
+        Align OptimalAlign = getOptimalAlignForParam(Ty);
 
-        unsigned sz = DL.getTypeAllocSize(Ty);
-        O << "\t.param .align " << align.value() << " .b8 ";
+        O << "\t.param .align " << OptimalAlign.value() << " .b8 ";
         printParamName(I, paramIndex, O);
-        O << "[" << sz << "]";
+        O << "[" << DL.getTypeAllocSize(Ty) << "]";
 
         continue;
       }
@@ -1492,10 +1499,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
 
     if (isABI || isKernelFunc) {
       // Just print .param .align <a> .b8 .param[size];
-      // <a> = PAL.getparamalignment
+      // <a>  = optimal alignment for the element type; always multiple of
+      //        PAL.getParamAlignment
       // size = typeallocsize of element type
-      Align align =
-          DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy);
+      Align OptimalAlign = getOptimalAlignForParam(ETy);
+
       // Work around a bug in ptxas. When PTX code takes address of
       // byval parameter with alignment < 4, ptxas generates code to
       // spill argument into memory. Alas on sm_50+ ptxas generates
@@ -1507,10 +1515,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
       // TODO: this will need to be undone when we get to support multi-TU
       // device-side compilation as it breaks ABI compatibility with nvcc.
       // Hopefully ptxas bug is fixed by then.
-      if (!isKernelFunc && align < Align(4))
-        align = Align(4);
+      if (!isKernelFunc && OptimalAlign < Align(4))
+        OptimalAlign = Align(4);
       unsigned sz = DL.getTypeAllocSize(ETy);
-      O << "\t.param .align " << align.value() << " .b8 ";
+      O << "\t.param .align " << OptimalAlign.value() << " .b8 ";
       printParamName(I, paramIndex, O);
       O << "[" << sz << "]";
       continue;

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2cda034f047c1..382e83dbb4cb9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1302,8 +1302,8 @@ std::string NVPTXTargetLowering::getPrototype(
 
   bool first = true;
 
-  unsigned OIdx = 0;
-  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+  const Function *F = CB.getFunction();
+  for (unsigned i = 0, e = Args.size(), OIdx = 0; i != e; ++i, ++OIdx) {
     Type *Ty = Args[i].Ty;
     if (!first) {
       O << ", ";
@@ -1312,15 +1312,14 @@ std::string NVPTXTargetLowering::getPrototype(
 
     if (!Outs[OIdx].Flags.isByVal()) {
       if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
-        unsigned align = 0;
+        unsigned ParamAlign = 0;
         const CallInst *CallI = cast<CallInst>(&CB);
         // +1 because index 0 is reserved for return type alignment
-        if (!getAlign(*CallI, i + 1, align))
-          align = DL.getABITypeAlignment(Ty);
-        unsigned sz = DL.getTypeAllocSize(Ty);
-        O << ".param .align " << align << " .b8 ";
+        if (!getAlign(*CallI, i + 1, ParamAlign))
+          ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
+        O << ".param .align " << ParamAlign << " .b8 ";
         O << "_";
-        O << "[" << sz << "]";
+        O << "[" << DL.getTypeAllocSize(Ty) << "]";
         // update the index for Outs
         SmallVector<EVT, 16> vtparts;
         ComputeValueVTs(*this, DL, Ty, vtparts);
@@ -1352,11 +1351,17 @@ std::string NVPTXTargetLowering::getPrototype(
       continue;
     }
 
-    Align align = Outs[OIdx].Flags.getNonZeroByValAlign();
-    unsigned sz = Outs[OIdx].Flags.getByValSize();
-    O << ".param .align " << align.value() << " .b8 ";
+    Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
+
+    // Try to increase alignment. This code matches logic in LowerCall when
+    // alignment increase is performed to increase vectorization options.
+    Type *ETy = Args[i].IndirectType;
+    Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL);
+    ParamByValAlign = std::max(ParamByValAlign, AlignCandidate);
+
+    O << ".param .align " << ParamByValAlign.value() << " .b8 ";
     O << "_";
-    O << "[" << sz << "]";
+    O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
   }
   O << ");";
   return O.str();
@@ -1403,12 +1408,15 @@ Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
 
   // Check for function alignment information if we found that the
   // ultimate target is a Function
-  if (DirectCallee)
+  if (DirectCallee) {
     if (getAlign(*DirectCallee, Idx, Alignment))
       return Align(Alignment);
+    // If alignment information is not available, fall back to the
+    // default function param optimized type alignment
+    return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
+  }
 
-  // Call is indirect or alignment information is not available, fall back to
-  // the ABI type alignment
+  // Call is indirect, fall back to the ABI type alignment
   return DL.getABITypeAlign(Ty);
 }
 
@@ -1569,18 +1577,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     }
 
     // ByVal arguments
+    // TODO: remove code duplication when handling byval and non-byval cases.
     SmallVector<EVT, 16> VTs;
     SmallVector<uint64_t, 16> Offsets;
-    assert(Args[i].IndirectType && "byval arg must have indirect type");
-    ComputePTXValueVTs(*this, DL, Args[i].IndirectType, VTs, &Offsets, 0);
+    Type *ETy = Args[i].IndirectType;
+    assert(ETy && "byval arg must have indirect type");
+    ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, 0);
 
     // declare .param .align <align> .b8 .param<n>[<size>];
     unsigned sz = Outs[OIdx].Flags.getByValSize();
     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-    Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
+
     // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
     // so we don't need to worry about natural alignment or not.
     // See TargetLowering::LowerCallTo().
+    Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
+
+    // Try to increase alignment to enhance vectorization options.
+    const Function *F = CB->getCalledFunction();
+    Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL);
+    ArgAlign = std::max(ArgAlign, AlignCandidate);
 
     // Enforce minumum alignment of 4 to work around ptxas miscompile
     // for sm_50+. See corresponding alignment adjustment in
@@ -1594,29 +1610,67 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                         DeclareParamOps);
     InFlag = Chain.getValue(1);
+
+    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
+    SmallVector<SDValue, 6> StoreOperands;
     for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
       EVT elemtype = VTs[j];
       int curOffset = Offsets[j];
-      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset);
+      Align PartAlign = commonAlignment(ArgAlign, curOffset);
+
+      // New store.
+      if (VectorInfo[j] & PVF_FIRST) {
+        assert(StoreOperands.empty() && "Unfinished preceding store.");
+        StoreOperands.push_back(Chain);
+        StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
+        StoreOperands.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
+      }
+
       auto PtrVT = getPointerTy(DL);
       SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
                                     DAG.getConstant(curOffset, dl, PtrVT));
       SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
                                    MachinePointerInfo(), PartAlign);
+
       if (elemtype.getSizeInBits() < 16) {
+        // Use 16-bit registers for small stores as it's the
+        // smallest general purpose register size supported by NVPTX.
         theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
       }
-      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-      SDValue CopyParamOps[] = { Chain,
-                                 DAG.getConstant(paramCount, dl, MVT::i32),
-                                 DAG.getConstant(curOffset, dl, MVT::i32),
-                                 theVal, InFlag };
-      Chain = DAG.getMemIntrinsicNode(
-          NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype,
-          MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore);
 
-      InFlag = Chain.getValue(1);
+      // Record the value to store.
+      StoreOperands.push_back(theVal);
+
+      if (VectorInfo[j] & PVF_LAST) {
+        unsigned NumElts = StoreOperands.size() - 3;
+        NVPTXISD::NodeType Op;
+        switch (NumElts) {
+        case 1:
+          Op = NVPTXISD::StoreParam;
+          break;
+        case 2:
+          Op = NVPTXISD::StoreParamV2;
+          break;
+        case 4:
+          Op = NVPTXISD::StoreParamV4;
+          break;
+        default:
+          llvm_unreachable("Invalid vector info.");
+        }
+
+        StoreOperands.push_back(InFlag);
+
+        Chain = DAG.getMemIntrinsicNode(
+            Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
+            elemtype, MachinePointerInfo(), PartAlign,
+            MachineMemOperand::MOStore);
+        InFlag = Chain.getValue(1);
+
+        // Cleanup.
+        StoreOperands.clear();
+      }
     }
+    assert(StoreOperands.empty() && "Unfinished parameter store.");
     ++paramCount;
   }
 
@@ -2617,7 +2671,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  const SmallVectorImpl<SDValue> &OutVals,
                                  const SDLoc &dl, SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function &F = MF.getFunction();
   Type *RetTy = MF.getFunction().getReturnType();
 
   bool isABI = (STI.getSmVersion() >= 20);
@@ -2632,7 +2687,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
 
   auto VectorInfo = VectorizePTXValueVTs(
-      VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1));
+      VTs, Offsets,
+      RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
+                       : Align(1));
 
   // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
   // 32-bits are sign extended or zero extended, depending on whether
@@ -4252,6 +4309,55 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
   return false;
 }
 
+/// getFunctionParamOptimizedAlign - since function arguments are passed via
+/// .param space, we may want to increase their alignment in a way that
+/// ensures that we can effectively vectorize their loads & stores. We can
+/// increase alignment only if the function has internal or has private
+/// linkage as for other linkage types callers may already rely on default
+/// alignment. To allow using 128-bit vectorized loads/stores, this function
+/// ensures that alignment is 16 or greater.
+Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
+    const Function *F, Type *ArgTy, const DataLayout &DL) const {
+  const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
+
+  // If a function has linkage 
diff erent from internal or private, we
+  // must use default ABI alignment as external users rely on it.
+  switch (F->getLinkage()) {
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage: {
+    // Check that if a function has internal or private linkage
+    // it is not a kernel.
+#ifndef NDEBUG
+    const NamedMDNode *NMDN =
+        F->getParent()->getNamedMetadata("nvvm.annotations");
+    if (NMDN) {
+      for (const MDNode *MDN : NMDN->operands()) {
+        assert(MDN->getNumOperands() == 3);
+
+        const Metadata *MD0 = MDN->getOperand(0).get();
+        const auto *MDV0 = cast<ConstantAsMetadata>(MD0)->getValue();
+        const auto *MDFn = cast<Function>(MDV0);
+        if (MDFn != F)
+          continue;
+
+        const Metadata *MD1 = MDN->getOperand(1).get();
+        const MDString *MDStr = cast<MDString>(MD1);
+        if (MDStr->getString() != "kernel")
+          continue;
+
+        const Metadata *MD2 = MDN->getOperand(2).get();
+        const auto *MDV2 = cast<ConstantAsMetadata>(MD2)->getValue();
+        assert(!cast<ConstantInt>(MDV2)->isZero());
+      }
+    }
+#endif
+    return Align(std::max(uint64_t(16), ABITypeAlign));
+  }
+  default:
+    return Align(ABITypeAlign);
+  }
+}
+
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 /// Used to guide target specific optimizations, like loop strength reduction

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 13829b924d4b4..18a697deacb44 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -451,6 +451,16 @@ class NVPTXTargetLowering : public TargetLowering {
                           MachineFunction &MF,
                           unsigned Intrinsic) const override;
 
+  /// getFunctionParamOptimizedAlign - since function arguments are passed via
+  /// .param space, we may want to increase their alignment in a way that
+  /// ensures that we can effectively vectorize their loads & stores. We can
+  /// increase alignment only if the function has internal or has private
+  /// linkage as for other linkage types callers may already rely on default
+  /// alignment. To allow using 128-bit vectorized loads/stores, this function
+  /// ensures that alignment is 16 or greater.
+  Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy,
+                                       const DataLayout &DL) const;
+
   /// isLegalAddressingMode - Return true if the addressing mode represented
   /// by AM is legal for this target, for a load/store of the specified type
   /// Used to guide target specific optimizations, like loop strength

diff  --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 6183019de43df..19b04f49d76ca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -88,16 +88,17 @@
 // cancel the addrspacecast pair this pass emits.
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "NVPTX.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
-#include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
+#include <queue>
 
 #define DEBUG_TYPE "nvptx-lower-args"
 
@@ -226,6 +227,90 @@ static void convertToParamAS(Value *OldUser, Value *Param) {
                  [](Instruction *I) { I->eraseFromParent(); });
 }
 
+// Adjust alignment of arguments passed byval in .param address space. We can
+// increase alignment of such arguments in a way that ensures that we can
+// effectively vectorize their loads. We should also traverse all loads from
+// byval pointer and adjust their alignment, if those were using known offset.
+// Such alignment changes must be conformed with parameter store and load in
+// NVPTXTargetLowering::LowerCall.
+static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
+                                    const NVPTXTargetLowering *TLI) {
+  Function *Func = Arg->getParent();
+  Type *StructType = Arg->getParamByValType();
+  const DataLayout DL(Func->getParent());
+
+  uint64_t NewArgAlign =
+      TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value();
+  uint64_t CurArgAlign =
+      Arg->getAttribute(Attribute::Alignment).getValueAsInt();
+
+  if (CurArgAlign >= NewArgAlign)
+    return;
+
+  LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign << " instead of "
+                    << CurArgAlign << " for " << *Arg << '\n');
+
+  auto NewAlignAttr =
+      Attribute::get(Func->getContext(), Attribute::Alignment, NewArgAlign);
+  Arg->removeAttr(Attribute::Alignment);
+  Arg->addAttr(NewAlignAttr);
+
+  struct Load {
+    LoadInst *Inst;
+    uint64_t Offset;
+  };
+
+  struct LoadContext {
+    Value *InitialVal;
+    uint64_t Offset;
+  };
+
+  SmallVector<Load> Loads;
+  std::queue<LoadContext> Worklist;
+  Worklist.push({ArgInParamAS, 0});
+
+  while (!Worklist.empty()) {
+    LoadContext Ctx = Worklist.front();
+    Worklist.pop();
+
+    for (User *CurUser : Ctx.InitialVal->users()) {
+      if (auto *I = dyn_cast<LoadInst>(CurUser)) {
+        Loads.push_back({I, Ctx.Offset});
+        continue;
+      }
+
+      if (auto *I = dyn_cast<BitCastInst>(CurUser)) {
+        Worklist.push({I, Ctx.Offset});
+        continue;
+      }
+
+      if (auto *I = dyn_cast<GetElementPtrInst>(CurUser)) {
+        APInt OffsetAccumulated =
+            APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM));
+
+        if (!I->accumulateConstantOffset(DL, OffsetAccumulated))
+          continue;
+
+        uint64_t OffsetLimit = -1;
+        uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit);
+        assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX");
+
+        Worklist.push({I, Ctx.Offset + Offset});
+        continue;
+      }
+
+      llvm_unreachable("All users must be one of: load, "
+                       "bitcast, getelementptr.");
+    }
+  }
+
+  for (Load &CurLoad : Loads) {
+    Align NewLoadAlign(greatestCommonDivisor(NewArgAlign, CurLoad.Offset));
+    Align CurLoadAlign(CurLoad.Inst->getAlign());
+    CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign));
+  }
+}
+
 void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   Function *Func = Arg->getParent();
   Instruction *FirstInst = &(Func->getEntryBlock().front());
@@ -270,6 +355,16 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
       convertToParamAS(V, ArgInParamAS);
     });
     LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n");
+
+    // Further optimizations require target lowering info.
+    if (!TM)
+      return;
+
+    const auto *TLI =
+        cast<NVPTXTargetLowering>(TM->getSubtargetImpl()->getTargetLowering());
+
+    adjustByValArgAlignment(Arg, ArgInParamAS, TLI);
+
     return;
   }
 

diff  --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
new file mode 100644
index 0000000000000..681faf0bfa534
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -0,0 +1,801 @@
+; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
+;
+; Check that parameters of a __device__ function with private or internal
+; linkage called from a __global__ (kernel) function get increased alignment,
+; and additional vectorization is performed on loads/stores with that
+; parameters.
+;
+; Test IR is a minimized version of IR generated with the following command
+; from the source code below:
+; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu
+;
+; ----------------------------------------------------------------------------
+; #include <stdint.h>
+;
+; struct St4x1 { uint32_t field[1]; };
+; struct St4x2 { uint32_t field[2]; };
+; struct St4x3 { uint32_t field[3]; };
+; struct St4x4 { uint32_t field[4]; };
+; struct St4x5 { uint32_t field[5]; };
+; struct St4x6 { uint32_t field[6]; };
+; struct St4x7 { uint32_t field[7]; };
+; struct St4x8 { uint32_t field[8]; };
+; struct St8x1 { uint64_t field[1]; };
+; struct St8x2 { uint64_t field[2]; };
+; struct St8x3 { uint64_t field[3]; };
+; struct St8x4 { uint64_t field[4]; };
+;
+; #define DECLARE_CALLEE(StName)                                      \
+; static __device__  __attribute__((noinline))                        \
+; struct StName callee_##StName(struct StName in) {                   \
+;   struct StName ret;                                                \
+;   const unsigned size = sizeof(ret.field) / sizeof(*ret.field);     \
+;   for (unsigned i = 0; i != size; ++i)                              \
+;     ret.field[i] = in.field[i];                                     \
+;   return ret;                                                       \
+; }                                                                   \
+
+; #define DECLARE_CALLER(StName)                                      \
+; __global__                                                          \
+; void caller_##StName(struct StName in, struct StName* ret)          \
+; {                                                                   \
+;   *ret = callee_##StName(in);                                       \
+; }                                                                   \
+;
+; #define DECLARE_CALL(StName)  \
+;     DECLARE_CALLEE(StName)    \
+;     DECLARE_CALLER(StName)    \
+;
+; DECLARE_CALL(St4x1)
+; DECLARE_CALL(St4x2)
+; DECLARE_CALL(St4x3)
+; DECLARE_CALL(St4x4)
+; DECLARE_CALL(St4x5)
+; DECLARE_CALL(St4x6)
+; DECLARE_CALL(St4x7)
+; DECLARE_CALL(St4x8)
+; DECLARE_CALL(St8x1)
+; DECLARE_CALL(St8x2)
+; DECLARE_CALL(St8x3)
+; DECLARE_CALL(St8x4)
+; ----------------------------------------------------------------------------
+
+%struct.St4x1 = type { [1 x i32] }
+%struct.St4x2 = type { [2 x i32] }
+%struct.St4x3 = type { [3 x i32] }
+%struct.St4x4 = type { [4 x i32] }
+%struct.St4x5 = type { [5 x i32] }
+%struct.St4x6 = type { [6 x i32] }
+%struct.St4x7 = type { [7 x i32] }
+%struct.St4x8 = type { [8 x i32] }
+%struct.St8x1 = type { [1 x i64] }
+%struct.St8x2 = type { [2 x i64] }
+%struct.St8x3 = type { [3 x i64] }
+%struct.St8x4 = type { [4 x i64] }
+
+; Section 1 - checking that:
+; - function argument (including retval) vectorization is done with internal linkage;
+; - caller and callee specify correct alignment for callee's params.
+
+define dso_local void @caller_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in, %struct.St4x1* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St4x1(
+  ; CHECK:               .param .align 4 .b8 caller_St4x1_param_0[4],
+  ; CHECK:               .param .b32 caller_St4x1_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .b32 param0;
+  ; CHECK:       st.param.b32 [param0+0], {{%r[0-9]+}};
+  ; CHECK:       .param .align 16 .b8 retval0[4];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St4x1,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.b32 {{%r[0-9]+}}, [retval0+0];
+  %1 = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i64 0, i32 0, i64 0
+  %2 = load i32, i32* %1, align 4
+  %call = tail call fastcc [1 x i32] @callee_St4x1(i32 %2)
+  %.fca.0.extract = extractvalue [1 x i32] %call, 0
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %ret, i64 0, i32 0, i64 0
+  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
+  ret void
+}
+
+define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[4])
+  ; CHECK-LABEL: callee_St4x1(
+  ; CHECK-NEXT:  .param .b32 callee_St4x1_param_0
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0];
+  ; CHECK:       st.param.b32 [func_retval0+0], [[R1]];
+  ; CHECK-NEXT:  ret;
+  %oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0
+  ret [1 x i32] %oldret
+}
+
+define dso_local void @caller_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in, %struct.St4x2* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St4x2(
+  ; CHECK:               .param .align 4 .b8 caller_St4x2_param_0[8],
+  ; CHECK:               .param .b32 caller_St4x2_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[8];
+  ; CHECK:       st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       .param .align 16 .b8 retval0[8];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St4x2,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
+  %agg.tmp = alloca i64, align 8
+  %tmpcast = bitcast i64* %agg.tmp to %struct.St4x2*
+  %1 = bitcast %struct.St4x2* %in to i64*
+  %2 = load i64, i64* %1, align 4
+  store i64 %2, i64* %agg.tmp, align 8
+  %call = tail call fastcc [2 x i32] @callee_St4x2(%struct.St4x2* noundef nonnull byval(%struct.St4x2) align 4 %tmpcast)
+  %.fca.0.extract = extractvalue [2 x i32] %call, 0
+  %.fca.1.extract = extractvalue [2 x i32] %call, 1
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 0
+  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
+  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 1
+  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4
+  ret void
+}
+
+define internal fastcc [2 x i32] @callee_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[8])
+  ; CHECK-LABEL: callee_St4x2(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x2_param_0[8]
+  ; CHECK:       ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0];
+  ; CHECK:       st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]};
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %3 = insertvalue [2 x i32] poison, i32 %1, 0
+  %oldret = insertvalue [2 x i32] %3, i32 %2, 1
+  ret [2 x i32] %oldret
+}
+
+define dso_local void @caller_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in, %struct.St4x3* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St4x3(
+  ; CHECK:               .param .align 4 .b8 caller_St4x3_param_0[12],
+  ; CHECK:               .param .b32 caller_St4x3_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[12];
+  ; CHECK:       st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       st.param.b32    [param0+8], {{%r[0-9]+}};
+  ; CHECK:       .param .align 16 .b8 retval0[12];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St4x3,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
+  ; CHECK:       ld.param.b32    {{%r[0-9]+}},  [retval0+8];
+  %call = tail call fastcc [3 x i32] @callee_St4x3(%struct.St4x3* noundef nonnull byval(%struct.St4x3) align 4 %in)
+  %.fca.0.extract = extractvalue [3 x i32] %call, 0
+  %.fca.1.extract = extractvalue [3 x i32] %call, 1
+  %.fca.2.extract = extractvalue [3 x i32] %call, 2
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 0
+  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
+  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 1
+  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4
+  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 2
+  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4
+  ret void
+}
+
+
+define internal fastcc [3 x i32] @callee_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[12])
+  ; CHECK-LABEL: callee_St4x3(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x3_param_0[12]
+  ; CHECK:       ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
+  ; CHECK:       ld.param.u32    [[R3:%r[0-9]+]],  [callee_St4x3_param_0+8];
+  ; CHECK:       st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]};
+  ; CHECK:       st.param.b32    [func_retval0+8], [[R3]];
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %4 = insertvalue [3 x i32] poison, i32 %1, 0
+  %5 = insertvalue [3 x i32] %4, i32 %2, 1
+  %oldret = insertvalue [3 x i32] %5, i32 %3, 2
+  ret [3 x i32] %oldret
+}
+
+
+define dso_local void @caller_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in, %struct.St4x4* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St4x4(
+  ; CHECK:               .param .align 4 .b8 caller_St4x4_param_0[16],
+  ; CHECK:               .param .b32 caller_St4x4_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[16];
+  ; CHECK:       st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       .param .align 16 .b8 retval0[16];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St4x4,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
+  %call = tail call fastcc [4 x i32] @callee_St4x4(%struct.St4x4* noundef nonnull byval(%struct.St4x4) align 4 %in)
+  %.fca.0.extract = extractvalue [4 x i32] %call, 0
+  %.fca.1.extract = extractvalue [4 x i32] %call, 1
+  %.fca.2.extract = extractvalue [4 x i32] %call, 2
+  %.fca.3.extract = extractvalue [4 x i32] %call, 3
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 0
+  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
+  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 1
+  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4
+  %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 2
+  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx5, align 4
+  %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 3
+  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx7, align 4
+  ret void
+}
+
+
+define internal fastcc [4 x i32] @callee_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
+  ; CHECK-LABEL: callee_St4x4(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x4_param_0[16]
+  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0];
+  ; CHECK:       st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]};
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %5 = insertvalue [4 x i32] poison, i32 %1, 0
+  %6 = insertvalue [4 x i32] %5, i32 %2, 1
+  %7 = insertvalue [4 x i32] %6, i32 %3, 2
+  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
+  ret [4 x i32] %oldret
+}
+
+
+define dso_local void @caller_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in, %struct.St4x5* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St4x5(
+  ; CHECK:               .param .align 4 .b8 caller_St4x5_param_0[20],
+  ; CHECK:               .param .b32 caller_St4x5_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[20];
+  ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       st.param.b32    [param0+16], {{%r[0-9]+}};
+  ; CHECK:       .param .align 16 .b8 retval0[20];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St4x5,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
+  ; CHECK:       ld.param.b32    {{%r[0-9]+}},  [retval0+16];
+  %call = tail call fastcc [5 x i32] @callee_St4x5(%struct.St4x5* noundef nonnull byval(%struct.St4x5) align 4 %in)
+  %.fca.0.extract = extractvalue [5 x i32] %call, 0
+  %.fca.1.extract = extractvalue [5 x i32] %call, 1
+  %.fca.2.extract = extractvalue [5 x i32] %call, 2
+  %.fca.3.extract = extractvalue [5 x i32] %call, 3
+  %.fca.4.extract = extractvalue [5 x i32] %call, 4
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 0
+  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
+  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 1
+  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4
+  %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 2
+  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx5, align 4
+  %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 3
+  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx7, align 4
+  %ref.tmp.sroa.7.0..sroa_idx9 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 4
+  store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx9, align 4
+  ret void
+}
+
+
+define internal fastcc [5 x i32] @callee_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[20])
+  ; CHECK-LABEL: callee_St4x5(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x5_param_0[20]
+  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0];
+  ; CHECK:       ld.param.u32    [[R5:%r[0-9]+]],   [callee_St4x5_param_0+16];
+  ; CHECK:       st.param.v4.b32 [func_retval0+0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
+  ; CHECK:       st.param.b32    [func_retval0+16], [[R5]];
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 4
+  %5 = load i32, i32* %arrayidx.4, align 4
+  %6 = insertvalue [5 x i32] poison, i32 %1, 0
+  %7 = insertvalue [5 x i32] %6, i32 %2, 1
+  %8 = insertvalue [5 x i32] %7, i32 %3, 2
+  %9 = insertvalue [5 x i32] %8, i32 %4, 3
+  %oldret = insertvalue [5 x i32] %9, i32 %5, 4
+  ret [5 x i32] %oldret
+}
+
+
+define dso_local void @caller_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in, %struct.St4x6* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St4x6(
+  ; CHECK:               .param .align 4 .b8 caller_St4x6_param_0[24],
+  ; CHECK:               .param .b32 caller_St4x6_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[24];
+  ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       .param .align 16 .b8 retval0[24];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St4x6,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
+  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
+  %call = tail call fastcc [6 x i32] @callee_St4x6(%struct.St4x6* noundef nonnull byval(%struct.St4x6) align 4 %in)
+  %.fca.0.extract = extractvalue [6 x i32] %call, 0
+  %.fca.1.extract = extractvalue [6 x i32] %call, 1
+  %.fca.2.extract = extractvalue [6 x i32] %call, 2
+  %.fca.3.extract = extractvalue [6 x i32] %call, 3
+  %.fca.4.extract = extractvalue [6 x i32] %call, 4
+  %.fca.5.extract = extractvalue [6 x i32] %call, 5
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 0
+  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
+  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 1
+  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4
+  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 2
+  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4
+  %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 3
+  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4
+  %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 4
+  store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4
+  %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 5
+  store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4
+  ret void
+}
+
+
+define internal fastcc [6 x i32] @callee_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[24])
+  ; CHECK-LABEL: callee_St4x6(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x6_param_0[24]
+  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0];
+  ; CHECK:       ld.param.v2.u32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16];
+  ; CHECK:       st.param.v4.b32 [func_retval0+0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
+  ; CHECK:       st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 4
+  %5 = load i32, i32* %arrayidx.4, align 4
+  %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 5
+  %6 = load i32, i32* %arrayidx.5, align 4
+  %7 = insertvalue [6 x i32] poison, i32 %1, 0
+  %8 = insertvalue [6 x i32] %7, i32 %2, 1
+  %9 = insertvalue [6 x i32] %8, i32 %3, 2
+  %10 = insertvalue [6 x i32] %9, i32 %4, 3
+  %11 = insertvalue [6 x i32] %10, i32 %5, 4
+  %oldret = insertvalue [6 x i32] %11, i32 %6, 5
+  ret [6 x i32] %oldret
+}
+
+
+define dso_local void @caller_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in, %struct.St4x7* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St4x7(
+  ; CHECK:               .param .align 4 .b8 caller_St4x7_param_0[28],
+  ; CHECK:               .param .b32 caller_St4x7_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[28];
+  ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       st.param.b32    [param0+24], {{%r[0-9]+}};
+  ; CHECK:       .param .align 16 .b8 retval0[28];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St4x7,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
+  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
+  ; CHECK:       ld.param.b32    {{%r[0-9]+}}, [retval0+24];
+  %call = tail call fastcc [7 x i32] @callee_St4x7(%struct.St4x7* noundef nonnull byval(%struct.St4x7) align 4 %in)
+  %.fca.0.extract = extractvalue [7 x i32] %call, 0
+  %.fca.1.extract = extractvalue [7 x i32] %call, 1
+  %.fca.2.extract = extractvalue [7 x i32] %call, 2
+  %.fca.3.extract = extractvalue [7 x i32] %call, 3
+  %.fca.4.extract = extractvalue [7 x i32] %call, 4
+  %.fca.5.extract = extractvalue [7 x i32] %call, 5
+  %.fca.6.extract = extractvalue [7 x i32] %call, 6
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 0
+  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
+  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 1
+  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4
+  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 2
+  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4
+  %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 3
+  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4
+  %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 4
+  store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4
+  %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 5
+  store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4
+  %ref.tmp.sroa.9.0..sroa_idx12 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 6
+  store i32 %.fca.6.extract, i32* %ref.tmp.sroa.9.0..sroa_idx12, align 4
+  ret void
+}
+
+
+define internal fastcc [7 x i32] @callee_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[28])
+  ; CHECK-LABEL: callee_St4x7(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x7_param_0[28]
+  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
+  ; CHECK:       ld.param.v2.u32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
+  ; CHECK:       ld.param.u32    [[R7:%r[0-9]+]],   [callee_St4x7_param_0+24];
+  ; CHECK:       st.param.v4.b32 [func_retval0+0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
+  ; CHECK:       st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
+  ; CHECK:       st.param.b32    [func_retval0+24], [[R7]];
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 4
+  %5 = load i32, i32* %arrayidx.4, align 4
+  %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 5
+  %6 = load i32, i32* %arrayidx.5, align 4
+  %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 6
+  %7 = load i32, i32* %arrayidx.6, align 4
+  %8 = insertvalue [7 x i32] poison, i32 %1, 0
+  %9 = insertvalue [7 x i32] %8, i32 %2, 1
+  %10 = insertvalue [7 x i32] %9, i32 %3, 2
+  %11 = insertvalue [7 x i32] %10, i32 %4, 3
+  %12 = insertvalue [7 x i32] %11, i32 %5, 4
+  %13 = insertvalue [7 x i32] %12, i32 %6, 5
+  %oldret = insertvalue [7 x i32] %13, i32 %7, 6
+  ret [7 x i32] %oldret
+}
+
+
+define dso_local void @caller_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in, %struct.St4x8* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St4x8(
+  ; CHECK:               .param .align 4 .b8 caller_St4x8_param_0[32],
+  ; CHECK:               .param .b32 caller_St4x8_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[32];
+  ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
+  ; CHECK:       .param .align 16 .b8 retval0[32];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St4x8,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
+  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
+  %call = tail call fastcc [8 x i32] @callee_St4x8(%struct.St4x8* noundef nonnull byval(%struct.St4x8) align 4 %in)
+  %.fca.0.extract = extractvalue [8 x i32] %call, 0
+  %.fca.1.extract = extractvalue [8 x i32] %call, 1
+  %.fca.2.extract = extractvalue [8 x i32] %call, 2
+  %.fca.3.extract = extractvalue [8 x i32] %call, 3
+  %.fca.4.extract = extractvalue [8 x i32] %call, 4
+  %.fca.5.extract = extractvalue [8 x i32] %call, 5
+  %.fca.6.extract = extractvalue [8 x i32] %call, 6
+  %.fca.7.extract = extractvalue [8 x i32] %call, 7
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 0
+  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
+  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 1
+  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4
+  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 2
+  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4
+  %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 3
+  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4
+  %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 4
+  store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4
+  %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 5
+  store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4
+  %ref.tmp.sroa.9.0..sroa_idx12 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 6
+  store i32 %.fca.6.extract, i32* %ref.tmp.sroa.9.0..sroa_idx12, align 4
+  %ref.tmp.sroa.10.0..sroa_idx14 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 7
+  store i32 %.fca.7.extract, i32* %ref.tmp.sroa.10.0..sroa_idx14, align 4
+  ret void
+}
+
+
+define internal fastcc [8 x i32] @callee_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[32])
+  ; CHECK-LABEL: callee_St4x8(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x8_param_0[32]
+  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0];
+  ; CHECK:       ld.param.v4.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16];
+  ; CHECK:       st.param.v4.b32 [func_retval0+0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
+  ; CHECK:       st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]};
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 4
+  %5 = load i32, i32* %arrayidx.4, align 4
+  %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 5
+  %6 = load i32, i32* %arrayidx.5, align 4
+  %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 6
+  %7 = load i32, i32* %arrayidx.6, align 4
+  %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 7
+  %8 = load i32, i32* %arrayidx.7, align 4
+  %9 = insertvalue [8 x i32] poison, i32 %1, 0
+  %10 = insertvalue [8 x i32] %9, i32 %2, 1
+  %11 = insertvalue [8 x i32] %10, i32 %3, 2
+  %12 = insertvalue [8 x i32] %11, i32 %4, 3
+  %13 = insertvalue [8 x i32] %12, i32 %5, 4
+  %14 = insertvalue [8 x i32] %13, i32 %6, 5
+  %15 = insertvalue [8 x i32] %14, i32 %7, 6
+  %oldret = insertvalue [8 x i32] %15, i32 %8, 7
+  ret [8 x i32] %oldret
+}
+
+
+define dso_local void @caller_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in, %struct.St8x1* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St8x1(
+  ; CHECK:               .param .align 8 .b8 caller_St8x1_param_0[8],
+  ; CHECK:               .param .b32 caller_St8x1_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .b64 param0;
+  ; CHECK:       st.param.b64 [param0+0], {{%rd[0-9]+}};
+  ; CHECK:       .param .align 16 .b8 retval0[8];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St8x1,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.b64 {{%rd[0-9]+}}, [retval0+0];
+  %1 = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i64 0, i32 0, i64 0
+  %2 = load i64, i64* %1, align 8
+  %call = tail call fastcc [1 x i64] @callee_St8x1(i64 %2)
+  %.fca.0.extract = extractvalue [1 x i64] %call, 0
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %ret, i64 0, i32 0, i64 0
+  store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8
+  ret void
+}
+
+
+define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[8])
+  ; CHECK-LABEL: callee_St8x1(
+  ; CHECK-NEXT:  .param .b64 callee_St8x1_param_0
+  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0];
+  ; CHECK:       st.param.b64 [func_retval0+0],  [[RD1]];
+  ; CHECK-NEXT:  ret;
+  %oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0
+  ret [1 x i64] %oldret
+}
+
+
+define dso_local void @caller_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in, %struct.St8x2* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St8x2(
+  ; CHECK:               .param .align 8 .b8 caller_St8x2_param_0[16],
+  ; CHECK:               .param .b32 caller_St8x2_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[16];
+  ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+  ; CHECK:       .param .align 16 .b8 retval0[16];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St8x2,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0];
+  %call = tail call fastcc [2 x i64] @callee_St8x2(%struct.St8x2* noundef nonnull byval(%struct.St8x2) align 8 %in)
+  %.fca.0.extract = extractvalue [2 x i64] %call, 0
+  %.fca.1.extract = extractvalue [2 x i64] %call, 1
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 0
+  store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8
+  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 1
+  store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx3, align 8
+  ret void
+}
+
+
+define internal fastcc [2 x i64] @callee_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
+  ; CHECK-LABEL: callee_St8x2(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x2_param_0[16]
+  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0];
+  ; CHECK:       st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]};
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 0
+  %1 = load i64, i64* %arrayidx, align 8
+  %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 1
+  %2 = load i64, i64* %arrayidx.1, align 8
+  %3 = insertvalue [2 x i64] poison, i64 %1, 0
+  %oldret = insertvalue [2 x i64] %3, i64 %2, 1
+  ret [2 x i64] %oldret
+}
+
+
+define dso_local void @caller_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in, %struct.St8x3* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St8x3(
+  ; CHECK:               .param .align 8 .b8 caller_St8x3_param_0[24],
+  ; CHECK:               .param .b32 caller_St8x3_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[24];
+  ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+  ; CHECK:       st.param.b64    [param0+16], {{%rd[0-9]+}};
+  ; CHECK:       .param .align 16 .b8 retval0[24];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St8x3,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0];
+  ; CHECK:       ld.param.b64    {{%rd[0-9]+}}, [retval0+16];
+  %call = tail call fastcc [3 x i64] @callee_St8x3(%struct.St8x3* noundef nonnull byval(%struct.St8x3) align 8 %in)
+  %.fca.0.extract = extractvalue [3 x i64] %call, 0
+  %.fca.1.extract = extractvalue [3 x i64] %call, 1
+  %.fca.2.extract = extractvalue [3 x i64] %call, 2
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 0
+  store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8
+  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 1
+  store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx2, align 8
+  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 2
+  store i64 %.fca.2.extract, i64* %ref.tmp.sroa.5.0..sroa_idx4, align 8
+  ret void
+}
+
+
+define internal fastcc [3 x i64] @callee_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[24])
+  ; CHECK-LABEL: callee_St8x3(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x3_param_0[24]
+  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0];
+  ; CHECK:       ld.param.u64    [[RD3:%rd[0-9]+]],  [callee_St8x3_param_0+16];
+  ; CHECK:       st.param.v2.b64 [func_retval0+0],   {[[RD1]], [[RD2]]};
+  ; CHECK:       st.param.b64    [func_retval0+16],  [[RD3]];
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 0
+  %1 = load i64, i64* %arrayidx, align 8
+  %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 1
+  %2 = load i64, i64* %arrayidx.1, align 8
+  %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 2
+  %3 = load i64, i64* %arrayidx.2, align 8
+  %4 = insertvalue [3 x i64] poison, i64 %1, 0
+  %5 = insertvalue [3 x i64] %4, i64 %2, 1
+  %oldret = insertvalue [3 x i64] %5, i64 %3, 2
+  ret [3 x i64] %oldret
+}
+
+
+define dso_local void @caller_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in, %struct.St8x4* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func caller_St8x4(
+  ; CHECK:               .param .align 8 .b8 caller_St8x4_param_0[32],
+  ; CHECK:               .param .b32 caller_St8x4_param_1
+  ; CHECK:       )
+  ; CHECK:       .param .align 16 .b8 param0[32];
+  ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+  ; CHECK:       st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+  ; CHECK:       .param .align 16 .b8 retval0[32];
+  ; CHECK:       call.uni (retval0),
+  ; CHECK-NEXT:  callee_St8x4,
+  ; CHECK-NEXT:  (
+  ; CHECK-NEXT:  param0
+  ; CHECK-NEXT:  );
+  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0];
+  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16];
+  %call = tail call fastcc [4 x i64] @callee_St8x4(%struct.St8x4* noundef nonnull byval(%struct.St8x4) align 8 %in)
+  %.fca.0.extract = extractvalue [4 x i64] %call, 0
+  %.fca.1.extract = extractvalue [4 x i64] %call, 1
+  %.fca.2.extract = extractvalue [4 x i64] %call, 2
+  %.fca.3.extract = extractvalue [4 x i64] %call, 3
+  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 0
+  store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8
+  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 1
+  store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx3, align 8
+  %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 2
+  store i64 %.fca.2.extract, i64* %ref.tmp.sroa.5.0..sroa_idx5, align 8
+  %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 3
+  store i64 %.fca.3.extract, i64* %ref.tmp.sroa.6.0..sroa_idx7, align 8
+  ret void
+}
+
+
+define internal fastcc [4 x i64] @callee_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[32])
+  ; CHECK-LABEL: callee_St8x4(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x4_param_0[32]
+  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0];
+  ; CHECK:       ld.param.v2.u64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16];
+  ; CHECK:       st.param.v2.b64 [func_retval0+0],  {[[RD1]], [[RD2]]};
+  ; CHECK:       st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]};
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 0
+  %1 = load i64, i64* %arrayidx, align 8
+  %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 1
+  %2 = load i64, i64* %arrayidx.1, align 8
+  %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 2
+  %3 = load i64, i64* %arrayidx.2, align 8
+  %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 3
+  %4 = load i64, i64* %arrayidx.3, align 8
+  %5 = insertvalue [4 x i64] poison, i64 %1, 0
+  %6 = insertvalue [4 x i64] %5, i64 %2, 1
+  %7 = insertvalue [4 x i64] %6, i64 %3, 2
+  %oldret = insertvalue [4 x i64] %7, i64 %4, 3
+  ret [4 x i64] %oldret
+}
+
+; Section 2 - checking that function argument (including retval) vectorization is done with private linkage.
+
+define private fastcc [4 x i32] @callee_St4x4_private(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
+  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
+  ; CHECK-LABEL: callee_St4x4_private(
+  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x4_private_param_0[16]
+  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0];
+  ; CHECK:       st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]};
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %5 = insertvalue [4 x i32] poison, i32 %1, 0
+  %6 = insertvalue [4 x i32] %5, i32 %2, 1
+  %7 = insertvalue [4 x i32] %6, i32 %3, 2
+  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
+  ret [4 x i32] %oldret
+}
+
+; Section 3 - checking that function argument (including retval) vectorization
+; is NOT done with linkage types other than internal and private.
+
+define external fastcc [4 x i32] @callee_St4x4_external(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
+  ; CHECK:       .func  (.param .align 4 .b8 func_retval0[16])
+  ; CHECK-LABEL: callee_St4x4_external(
+  ; CHECK-NEXT:  .param .align 4 .b8 callee_St4x4_external_param_0[16]
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]],   [callee_St4x4_external_param_0];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]],   [callee_St4x4_external_param_0+4];
+  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]],   [callee_St4x4_external_param_0+8];
+  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]],   [callee_St4x4_external_param_0+12];
+  ; CHECK:       st.param.b32 [func_retval0+0],  [[R1]];
+  ; CHECK:       st.param.b32 [func_retval0+4],  [[R2]];
+  ; CHECK:       st.param.b32 [func_retval0+8],  [[R3]];
+  ; CHECK:       st.param.b32 [func_retval0+12], [[R4]];
+  ; CHECK-NEXT:  ret;
+  %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %5 = insertvalue [4 x i32] poison, i32 %1, 0
+  %6 = insertvalue [4 x i32] %5, i32 %2, 1
+  %7 = insertvalue [4 x i32] %6, i32 %3, 2
+  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
+  ret [4 x i32] %oldret
+}

diff  --git a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
new file mode 100644
index 0000000000000..4a1ed8f4dcda7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
@@ -0,0 +1,456 @@
+; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
+;
+; Check that parameters of a __global__ (kernel) function do not get increased
+; alignment, and no additional vectorization is performed on loads/stores with
+; that parameters.
+;
+; Test IR is a minimized version of IR generated with the following command
+; from the source code below:
+; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu
+;
+; ----------------------------------------------------------------------------
+; #include <stdint.h>
+;
+; struct St4x1 { uint32_t field[1]; };
+; struct St4x2 { uint32_t field[2]; };
+; struct St4x3 { uint32_t field[3]; };
+; struct St4x4 { uint32_t field[4]; };
+; struct St4x5 { uint32_t field[5]; };
+; struct St4x6 { uint32_t field[6]; };
+; struct St4x7 { uint32_t field[7]; };
+; struct St4x8 { uint32_t field[8]; };
+; struct St8x1 { uint64_t field[1]; };
+; struct St8x2 { uint64_t field[2]; };
+; struct St8x3 { uint64_t field[3]; };
+; struct St8x4 { uint64_t field[4]; };
+;
+; #define DECLARE_FUNCTION(StName)                                    \
+; static __global__  __attribute__((noinline))                        \
+; void foo_##StName(struct StName in, struct StName* ret) {           \
+;   const unsigned size = sizeof(ret->field) / sizeof(*ret->field);   \
+;   for (unsigned i = 0; i != size; ++i)                              \
+;     ret->field[i] = in.field[i];                                    \
+; }                                                                   \
+;
+; DECLARE_FUNCTION(St4x1)
+; DECLARE_FUNCTION(St4x2)
+; DECLARE_FUNCTION(St4x3)
+; DECLARE_FUNCTION(St4x4)
+; DECLARE_FUNCTION(St4x5)
+; DECLARE_FUNCTION(St4x6)
+; DECLARE_FUNCTION(St4x7)
+; DECLARE_FUNCTION(St4x8)
+; DECLARE_FUNCTION(St8x1)
+; DECLARE_FUNCTION(St8x2)
+; DECLARE_FUNCTION(St8x3)
+; DECLARE_FUNCTION(St8x4)
+; ----------------------------------------------------------------------------
+
+%struct.St4x1 = type { [1 x i32] }
+%struct.St4x2 = type { [2 x i32] }
+%struct.St4x3 = type { [3 x i32] }
+%struct.St4x4 = type { [4 x i32] }
+%struct.St4x5 = type { [5 x i32] }
+%struct.St4x6 = type { [6 x i32] }
+%struct.St4x7 = type { [7 x i32] }
+%struct.St4x8 = type { [8 x i32] }
+%struct.St8x1 = type { [1 x i64] }
+%struct.St8x2 = type { [2 x i64] }
+%struct.St8x3 = type { [3 x i64] }
+%struct.St8x4 = type { [4 x i64] }
+
+define dso_local void @foo_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in, %struct.St4x1* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St4x1(
+  ; CHECK:               .param .align 4 .b8 foo_St4x1_param_0[4],
+  ; CHECK:               .param .b32 foo_St4x1_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x1_param_1];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x1_param_0];
+  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds %struct.St4x1, %struct.St4x1* %ret, i64 0, i32 0, i64 0
+  store i32 %1, i32* %arrayidx3, align 4
+  ret void
+}
+
+define dso_local void @foo_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in, %struct.St4x2* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St4x2(
+  ; CHECK:               .param .align 4 .b8 foo_St4x2_param_0[8],
+  ; CHECK:               .param .b32 foo_St4x2_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x2_param_1];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x2_param_0];
+  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4];
+  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 0
+  store i32 %1, i32* %arrayidx3, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx3.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 1
+  store i32 %2, i32* %arrayidx3.1, align 4
+  ret void
+}
+
+define dso_local void @foo_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in, %struct.St4x3* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St4x3(
+  ; CHECK:               .param .align 4 .b8 foo_St4x3_param_0[12],
+  ; CHECK:               .param .b32 foo_St4x3_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x3_param_1];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x3_param_0];
+  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4];
+  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8];
+  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 0
+  store i32 %1, i32* %arrayidx3, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx3.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 1
+  store i32 %2, i32* %arrayidx3.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx3.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 2
+  store i32 %3, i32* %arrayidx3.2, align 4
+  ret void
+}
+
+define dso_local void @foo_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in, %struct.St4x4* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St4x4(
+  ; CHECK:               .param .align 4 .b8 foo_St4x4_param_0[16],
+  ; CHECK:               .param .b32 foo_St4x4_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x4_param_1];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x4_param_0];
+  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4];
+  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8];
+  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12];
+  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 0
+  store i32 %1, i32* %arrayidx3, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx3.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 1
+  store i32 %2, i32* %arrayidx3.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx3.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 2
+  store i32 %3, i32* %arrayidx3.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx3.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 3
+  store i32 %4, i32* %arrayidx3.3, align 4
+  ret void
+}
+
+define dso_local void @foo_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in, %struct.St4x5* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St4x5(
+  ; CHECK:               .param .align 4 .b8 foo_St4x5_param_0[20],
+  ; CHECK:               .param .b32 foo_St4x5_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x5_param_1];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x5_param_0];
+  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4];
+  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8];
+  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12];
+  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16];
+  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 0
+  store i32 %1, i32* %arrayidx3, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx3.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 1
+  store i32 %2, i32* %arrayidx3.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx3.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 2
+  store i32 %3, i32* %arrayidx3.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx3.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 3
+  store i32 %4, i32* %arrayidx3.3, align 4
+  %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 4
+  %5 = load i32, i32* %arrayidx.4, align 4
+  %arrayidx3.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 4
+  store i32 %5, i32* %arrayidx3.4, align 4
+  ret void
+}
+
+define dso_local void @foo_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in, %struct.St4x6* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St4x6(
+  ; CHECK:               .param .align 4 .b8 foo_St4x6_param_0[24],
+  ; CHECK:               .param .b32 foo_St4x6_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x6_param_1];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x6_param_0];
+  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4];
+  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8];
+  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12];
+  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16];
+  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
+  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20];
+  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 0
+  store i32 %1, i32* %arrayidx3, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx3.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 1
+  store i32 %2, i32* %arrayidx3.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx3.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 2
+  store i32 %3, i32* %arrayidx3.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx3.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 3
+  store i32 %4, i32* %arrayidx3.3, align 4
+  %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 4
+  %5 = load i32, i32* %arrayidx.4, align 4
+  %arrayidx3.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 4
+  store i32 %5, i32* %arrayidx3.4, align 4
+  %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 5
+  %6 = load i32, i32* %arrayidx.5, align 4
+  %arrayidx3.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 5
+  store i32 %6, i32* %arrayidx3.5, align 4
+  ret void
+}
+
+define dso_local void @foo_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in, %struct.St4x7* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St4x7(
+  ; CHECK:               .param .align 4 .b8 foo_St4x7_param_0[28],
+  ; CHECK:               .param .b32 foo_St4x7_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x7_param_1];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x7_param_0];
+  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4];
+  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8];
+  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12];
+  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16];
+  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
+  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20];
+  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
+  ; CHECK:       ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24];
+  ; CHECK:       st.u32  [[[R1]]+24], [[R8]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 0
+  store i32 %1, i32* %arrayidx3, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx3.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 1
+  store i32 %2, i32* %arrayidx3.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx3.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 2
+  store i32 %3, i32* %arrayidx3.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx3.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 3
+  store i32 %4, i32* %arrayidx3.3, align 4
+  %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 4
+  %5 = load i32, i32* %arrayidx.4, align 4
+  %arrayidx3.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 4
+  store i32 %5, i32* %arrayidx3.4, align 4
+  %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 5
+  %6 = load i32, i32* %arrayidx.5, align 4
+  %arrayidx3.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 5
+  store i32 %6, i32* %arrayidx3.5, align 4
+  %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 6
+  %7 = load i32, i32* %arrayidx.6, align 4
+  %arrayidx3.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 6
+  store i32 %7, i32* %arrayidx3.6, align 4
+  ret void
+}
+
+define dso_local void @foo_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in, %struct.St4x8* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St4x8(
+  ; CHECK:               .param .align 4 .b8 foo_St4x8_param_0[32],
+  ; CHECK:               .param .b32 foo_St4x8_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x8_param_1];
+  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x8_param_0];
+  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4];
+  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8];
+  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12];
+  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16];
+  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
+  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x8_param_0+20];
+  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
+  ; CHECK:       ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24];
+  ; CHECK:       st.u32  [[[R1]]+24], [[R8]];
+  ; CHECK:       ld.param.u32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28];
+  ; CHECK:       st.u32  [[[R1]]+28], [[R9]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 0
+  store i32 %1, i32* %arrayidx3, align 4
+  %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 1
+  %2 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx3.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 1
+  store i32 %2, i32* %arrayidx3.1, align 4
+  %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 2
+  %3 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx3.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 2
+  store i32 %3, i32* %arrayidx3.2, align 4
+  %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 3
+  %4 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx3.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 3
+  store i32 %4, i32* %arrayidx3.3, align 4
+  %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 4
+  %5 = load i32, i32* %arrayidx.4, align 4
+  %arrayidx3.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 4
+  store i32 %5, i32* %arrayidx3.4, align 4
+  %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 5
+  %6 = load i32, i32* %arrayidx.5, align 4
+  %arrayidx3.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 5
+  store i32 %6, i32* %arrayidx3.5, align 4
+  %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 6
+  %7 = load i32, i32* %arrayidx.6, align 4
+  %arrayidx3.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 6
+  store i32 %7, i32* %arrayidx3.6, align 4
+  %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 7
+  %8 = load i32, i32* %arrayidx.7, align 4
+  %arrayidx3.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 7
+  store i32 %8, i32* %arrayidx3.7, align 4
+  ret void
+}
+
+define dso_local void @foo_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in, %struct.St8x1* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St8x1(
+  ; CHECK:               .param .align 8 .b8 foo_St8x1_param_0[8],
+  ; CHECK:               .param .b32 foo_St8x1_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x1_param_1];
+  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0];
+  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i64 0, i32 0, i64 0
+  %1 = load i64, i64* %arrayidx, align 8
+  %arrayidx3 = getelementptr inbounds %struct.St8x1, %struct.St8x1* %ret, i64 0, i32 0, i64 0
+  store i64 %1, i64* %arrayidx3, align 8
+  ret void
+}
+
+define dso_local void @foo_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in, %struct.St8x2* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St8x2(
+  ; CHECK:               .param .align 8 .b8 foo_St8x2_param_0[16],
+  ; CHECK:               .param .b32 foo_St8x2_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x2_param_1];
+  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0];
+  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
+  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8];
+  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 0
+  %1 = load i64, i64* %arrayidx, align 8
+  %arrayidx3 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 0
+  store i64 %1, i64* %arrayidx3, align 8
+  %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 1
+  %2 = load i64, i64* %arrayidx.1, align 8
+  %arrayidx3.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 1
+  store i64 %2, i64* %arrayidx3.1, align 8
+  ret void
+}
+
+define dso_local void @foo_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in, %struct.St8x3* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St8x3(
+  ; CHECK:               .param .align 8 .b8 foo_St8x3_param_0[24],
+  ; CHECK:               .param .b32 foo_St8x3_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x3_param_1];
+  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0];
+  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
+  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8];
+  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
+  ; CHECK:       ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16];
+  ; CHECK:       st.u64 [[[R1]]+16], [[RD3]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 0
+  %1 = load i64, i64* %arrayidx, align 8
+  %arrayidx3 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 0
+  store i64 %1, i64* %arrayidx3, align 8
+  %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 1
+  %2 = load i64, i64* %arrayidx.1, align 8
+  %arrayidx3.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 1
+  store i64 %2, i64* %arrayidx3.1, align 8
+  %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 2
+  %3 = load i64, i64* %arrayidx.2, align 8
+  %arrayidx3.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 2
+  store i64 %3, i64* %arrayidx3.2, align 8
+  ret void
+}
+
+define dso_local void @foo_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in, %struct.St8x4* nocapture noundef writeonly %ret) {
+  ; CHECK-LABEL: .visible .func foo_St8x4(
+  ; CHECK:               .param .align 8 .b8 foo_St8x4_param_0[32],
+  ; CHECK:               .param .b32 foo_St8x4_param_1
+  ; CHECK:       )
+  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x4_param_1];
+  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0];
+  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
+  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8];
+  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
+  ; CHECK:       ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16];
+  ; CHECK:       st.u64 [[[R1]]+16], [[RD3]];
+  ; CHECK:       ld.param.u64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24];
+  ; CHECK:       st.u64 [[[R1]]+24], [[RD4]];
+  ; CHECK:       ret;
+  %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 0
+  %1 = load i64, i64* %arrayidx, align 8
+  %arrayidx3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 0
+  store i64 %1, i64* %arrayidx3, align 8
+  %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 1
+  %2 = load i64, i64* %arrayidx.1, align 8
+  %arrayidx3.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 1
+  store i64 %2, i64* %arrayidx3.1, align 8
+  %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 2
+  %3 = load i64, i64* %arrayidx.2, align 8
+  %arrayidx3.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 2
+  store i64 %3, i64* %arrayidx3.2, align 8
+  %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 3
+  %4 = load i64, i64* %arrayidx.3, align 8
+  %arrayidx3.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 3
+  store i64 %4, i64* %arrayidx3.3, align 8
+  ret void
+}


        


More information about the cfe-commits mailing list