[clang] a034878 - Revert "[NVPTX] Enhance vectorization of ld.param & st.param"

Daniil Kovalev via cfe-commits cfe-commits at lists.llvm.org
Thu Mar 24 02:32:58 PDT 2022


Author: Daniil Kovalev
Date: 2022-03-24T12:32:06+03:00
New Revision: a0348785649271e8c63a42bd4a83a2fefa96efe0

URL: https://github.com/llvm/llvm-project/commit/a0348785649271e8c63a42bd4a83a2fefa96efe0
DIFF: https://github.com/llvm/llvm-project/commit/a0348785649271e8c63a42bd4a83a2fefa96efe0.diff

LOG: Revert "[NVPTX] Enhance vectorization of ld.param & st.param"

This reverts commit f854434f0f2a01027bdaad8e6fdac5a782fce291.

Placed URL to wrong differential revision in commit message.

Added: 
    

Modified: 
    clang/test/CodeGenCUDA/device-fun-linkage.cu
    llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
    llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
    llvm/lib/Target/NVPTX/NVPTXISelLowering.h
    llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp

Removed: 
    llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
    llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll


################################################################################
diff  --git a/clang/test/CodeGenCUDA/device-fun-linkage.cu b/clang/test/CodeGenCUDA/device-fun-linkage.cu
index d8ad6d438be9c..d1b9db261151b 100644
--- a/clang/test/CodeGenCUDA/device-fun-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-fun-linkage.cu
@@ -1,32 +1,19 @@
-// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:   -emit-llvm -o - %s \
 // RUN:   | FileCheck -check-prefix=NORDC %s
-// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-llvm -o - %s \
-// RUN:   | FileCheck -check-prefix=NORDC-NEG %s
-// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -fgpu-rdc -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:   -fgpu-rdc -emit-llvm -o - %s \
 // RUN:   | FileCheck -check-prefix=RDC %s
-// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -fgpu-rdc -emit-llvm -o - %s \
-// RUN:   | FileCheck -check-prefix=RDC-NEG %s
 
 #include "Inputs/cuda.h"
 
+// NORDC: define internal void @_Z4funcIiEvv()
+// NORDC: define{{.*}} void @_Z6kernelIiEvv()
+// RDC: define weak_odr void @_Z4funcIiEvv()
+// RDC: define weak_odr void @_Z6kernelIiEvv()
+
 template <typename T> __device__ void func() {}
 template <typename T> __global__ void kernel() {}
 
 template __device__ void func<int>();
-// NORDC:     define internal void @_Z4funcIiEvv()
-// RDC:       define weak_odr void @_Z4funcIiEvv()
-
 template __global__ void kernel<int>();
-// NORDC:     define void @_Z6kernelIiEvv()
-// RDC:       define weak_odr void @_Z6kernelIiEvv()
-
-// Ensure that unused static device function is eliminated
-static __device__ void static_func() {}
-// NORDC-NEG-NOT: define{{.*}} void @_ZL13static_funcv()
-// RDC-NEG-NOT:   define{{.*}} void @_ZL13static_funcv()
-
-// Ensure that kernel function has external or weak_odr
-// linkage regardless static specifier
-static __global__ void static_kernel() {}
-// NORDC:     define void @_ZL13static_kernelv()
-// RDC:       define weak_odr void @_ZL13static_kernelv()

diff  --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index e8322a0a8425b..2516dff52efdf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -329,7 +329,7 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
 void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
   const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
-  const auto *TLI = cast<NVPTXTargetLowering>(STI.getTargetLowering());
+  const TargetLowering *TLI = STI.getTargetLowering();
 
   Type *Ty = F->getReturnType();
 
@@ -363,7 +363,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
       unsigned totalsz = DL.getTypeAllocSize(Ty);
       unsigned retAlignment = 0;
       if (!getAlign(*F, 0, retAlignment))
-        retAlignment = TLI->getFunctionParamOptimizedAlign(F, Ty, DL).value();
+        retAlignment = DL.getABITypeAlignment(Ty);
       O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
         << "]";
     } else
@@ -1348,8 +1348,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
   const AttributeList &PAL = F->getAttributes();
   const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
-  const auto *TLI = cast<NVPTXTargetLowering>(STI.getTargetLowering());
-
+  const TargetLowering *TLI = STI.getTargetLowering();
   Function::const_arg_iterator I, E;
   unsigned paramIndex = 0;
   bool first = true;
@@ -1406,24 +1405,18 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
       }
     }
 
-    auto getOptimalAlignForParam = [TLI, &DL, &PAL, F,
-                                    paramIndex](Type *Ty) -> Align {
-      Align TypeAlign = TLI->getFunctionParamOptimizedAlign(F, Ty, DL);
-      MaybeAlign ParamAlign = PAL.getParamAlignment(paramIndex);
-      return max(TypeAlign, ParamAlign);
-    };
-
     if (!PAL.hasParamAttr(paramIndex, Attribute::ByVal)) {
       if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
         // Just print .param .align <a> .b8 .param[size];
-        // <a>  = optimal alignment for the element type; always multiple of
-        //        PAL.getParamAlignment
+        // <a> = PAL.getparamalignment
         // size = typeallocsize of element type
-        Align OptimalAlign = getOptimalAlignForParam(Ty);
+        const Align align = DL.getValueOrABITypeAlignment(
+            PAL.getParamAlignment(paramIndex), Ty);
 
-        O << "\t.param .align " << OptimalAlign.value() << " .b8 ";
+        unsigned sz = DL.getTypeAllocSize(Ty);
+        O << "\t.param .align " << align.value() << " .b8 ";
         printParamName(I, paramIndex, O);
-        O << "[" << DL.getTypeAllocSize(Ty) << "]";
+        O << "[" << sz << "]";
 
         continue;
       }
@@ -1499,11 +1492,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
 
     if (isABI || isKernelFunc) {
       // Just print .param .align <a> .b8 .param[size];
-      // <a>  = optimal alignment for the element type; always multiple of
-      //        PAL.getParamAlignment
+      // <a> = PAL.getparamalignment
       // size = typeallocsize of element type
-      Align OptimalAlign = getOptimalAlignForParam(ETy);
-
+      Align align =
+          DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy);
       // Work around a bug in ptxas. When PTX code takes address of
       // byval parameter with alignment < 4, ptxas generates code to
       // spill argument into memory. Alas on sm_50+ ptxas generates
@@ -1515,10 +1507,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
       // TODO: this will need to be undone when we get to support multi-TU
       // device-side compilation as it breaks ABI compatibility with nvcc.
       // Hopefully ptxas bug is fixed by then.
-      if (!isKernelFunc && OptimalAlign < Align(4))
-        OptimalAlign = Align(4);
+      if (!isKernelFunc && align < Align(4))
+        align = Align(4);
       unsigned sz = DL.getTypeAllocSize(ETy);
-      O << "\t.param .align " << OptimalAlign.value() << " .b8 ";
+      O << "\t.param .align " << align.value() << " .b8 ";
       printParamName(I, paramIndex, O);
       O << "[" << sz << "]";
       continue;

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 382e83dbb4cb9..2cda034f047c1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1302,8 +1302,8 @@ std::string NVPTXTargetLowering::getPrototype(
 
   bool first = true;
 
-  const Function *F = CB.getFunction();
-  for (unsigned i = 0, e = Args.size(), OIdx = 0; i != e; ++i, ++OIdx) {
+  unsigned OIdx = 0;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
     Type *Ty = Args[i].Ty;
     if (!first) {
       O << ", ";
@@ -1312,14 +1312,15 @@ std::string NVPTXTargetLowering::getPrototype(
 
     if (!Outs[OIdx].Flags.isByVal()) {
       if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
-        unsigned ParamAlign = 0;
+        unsigned align = 0;
         const CallInst *CallI = cast<CallInst>(&CB);
         // +1 because index 0 is reserved for return type alignment
-        if (!getAlign(*CallI, i + 1, ParamAlign))
-          ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
-        O << ".param .align " << ParamAlign << " .b8 ";
+        if (!getAlign(*CallI, i + 1, align))
+          align = DL.getABITypeAlignment(Ty);
+        unsigned sz = DL.getTypeAllocSize(Ty);
+        O << ".param .align " << align << " .b8 ";
         O << "_";
-        O << "[" << DL.getTypeAllocSize(Ty) << "]";
+        O << "[" << sz << "]";
         // update the index for Outs
         SmallVector<EVT, 16> vtparts;
         ComputeValueVTs(*this, DL, Ty, vtparts);
@@ -1351,17 +1352,11 @@ std::string NVPTXTargetLowering::getPrototype(
       continue;
     }
 
-    Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
-
-    // Try to increase alignment. This code matches logic in LowerCall when
-    // alignment increase is performed to increase vectorization options.
-    Type *ETy = Args[i].IndirectType;
-    Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL);
-    ParamByValAlign = std::max(ParamByValAlign, AlignCandidate);
-
-    O << ".param .align " << ParamByValAlign.value() << " .b8 ";
+    Align align = Outs[OIdx].Flags.getNonZeroByValAlign();
+    unsigned sz = Outs[OIdx].Flags.getByValSize();
+    O << ".param .align " << align.value() << " .b8 ";
     O << "_";
-    O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
+    O << "[" << sz << "]";
   }
   O << ");";
   return O.str();
@@ -1408,15 +1403,12 @@ Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
 
   // Check for function alignment information if we found that the
   // ultimate target is a Function
-  if (DirectCallee) {
+  if (DirectCallee)
     if (getAlign(*DirectCallee, Idx, Alignment))
       return Align(Alignment);
-    // If alignment information is not available, fall back to the
-    // default function param optimized type alignment
-    return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
-  }
 
-  // Call is indirect, fall back to the ABI type alignment
+  // Call is indirect or alignment information is not available, fall back to
+  // the ABI type alignment
   return DL.getABITypeAlign(Ty);
 }
 
@@ -1577,26 +1569,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     }
 
     // ByVal arguments
-    // TODO: remove code duplication when handling byval and non-byval cases.
     SmallVector<EVT, 16> VTs;
     SmallVector<uint64_t, 16> Offsets;
-    Type *ETy = Args[i].IndirectType;
-    assert(ETy && "byval arg must have indirect type");
-    ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, 0);
+    assert(Args[i].IndirectType && "byval arg must have indirect type");
+    ComputePTXValueVTs(*this, DL, Args[i].IndirectType, VTs, &Offsets, 0);
 
     // declare .param .align <align> .b8 .param<n>[<size>];
     unsigned sz = Outs[OIdx].Flags.getByValSize();
     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-
+    Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
     // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
     // so we don't need to worry about natural alignment or not.
     // See TargetLowering::LowerCallTo().
-    Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
-
-    // Try to increase alignment to enhance vectorization options.
-    const Function *F = CB->getCalledFunction();
-    Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL);
-    ArgAlign = std::max(ArgAlign, AlignCandidate);
 
     // Enforce minumum alignment of 4 to work around ptxas miscompile
     // for sm_50+. See corresponding alignment adjustment in
@@ -1610,67 +1594,29 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                         DeclareParamOps);
     InFlag = Chain.getValue(1);
-
-    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
-    SmallVector<SDValue, 6> StoreOperands;
     for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
       EVT elemtype = VTs[j];
       int curOffset = Offsets[j];
-      Align PartAlign = commonAlignment(ArgAlign, curOffset);
-
-      // New store.
-      if (VectorInfo[j] & PVF_FIRST) {
-        assert(StoreOperands.empty() && "Unfinished preceding store.");
-        StoreOperands.push_back(Chain);
-        StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
-        StoreOperands.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
-      }
-
+      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset);
       auto PtrVT = getPointerTy(DL);
       SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
                                     DAG.getConstant(curOffset, dl, PtrVT));
       SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
                                    MachinePointerInfo(), PartAlign);
-
       if (elemtype.getSizeInBits() < 16) {
-        // Use 16-bit registers for small stores as it's the
-        // smallest general purpose register size supported by NVPTX.
         theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
       }
+      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue CopyParamOps[] = { Chain,
+                                 DAG.getConstant(paramCount, dl, MVT::i32),
+                                 DAG.getConstant(curOffset, dl, MVT::i32),
+                                 theVal, InFlag };
+      Chain = DAG.getMemIntrinsicNode(
+          NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype,
+          MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore);
 
-      // Record the value to store.
-      StoreOperands.push_back(theVal);
-
-      if (VectorInfo[j] & PVF_LAST) {
-        unsigned NumElts = StoreOperands.size() - 3;
-        NVPTXISD::NodeType Op;
-        switch (NumElts) {
-        case 1:
-          Op = NVPTXISD::StoreParam;
-          break;
-        case 2:
-          Op = NVPTXISD::StoreParamV2;
-          break;
-        case 4:
-          Op = NVPTXISD::StoreParamV4;
-          break;
-        default:
-          llvm_unreachable("Invalid vector info.");
-        }
-
-        StoreOperands.push_back(InFlag);
-
-        Chain = DAG.getMemIntrinsicNode(
-            Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
-            elemtype, MachinePointerInfo(), PartAlign,
-            MachineMemOperand::MOStore);
-        InFlag = Chain.getValue(1);
-
-        // Cleanup.
-        StoreOperands.clear();
-      }
+      InFlag = Chain.getValue(1);
     }
-    assert(StoreOperands.empty() && "Unfinished parameter store.");
     ++paramCount;
   }
 
@@ -2671,8 +2617,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  const SmallVectorImpl<SDValue> &OutVals,
                                  const SDLoc &dl, SelectionDAG &DAG) const {
-  const MachineFunction &MF = DAG.getMachineFunction();
-  const Function &F = MF.getFunction();
+  MachineFunction &MF = DAG.getMachineFunction();
   Type *RetTy = MF.getFunction().getReturnType();
 
   bool isABI = (STI.getSmVersion() >= 20);
@@ -2687,9 +2632,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
 
   auto VectorInfo = VectorizePTXValueVTs(
-      VTs, Offsets,
-      RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
-                       : Align(1));
+      VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1));
 
   // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
   // 32-bits are sign extended or zero extended, depending on whether
@@ -4309,55 +4252,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
   return false;
 }
 
-/// getFunctionParamOptimizedAlign - since function arguments are passed via
-/// .param space, we may want to increase their alignment in a way that
-/// ensures that we can effectively vectorize their loads & stores. We can
-/// increase alignment only if the function has internal or has private
-/// linkage as for other linkage types callers may already rely on default
-/// alignment. To allow using 128-bit vectorized loads/stores, this function
-/// ensures that alignment is 16 or greater.
-Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
-    const Function *F, Type *ArgTy, const DataLayout &DL) const {
-  const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
-
-  // If a function has linkage 
diff erent from internal or private, we
-  // must use default ABI alignment as external users rely on it.
-  switch (F->getLinkage()) {
-  case GlobalValue::InternalLinkage:
-  case GlobalValue::PrivateLinkage: {
-    // Check that if a function has internal or private linkage
-    // it is not a kernel.
-#ifndef NDEBUG
-    const NamedMDNode *NMDN =
-        F->getParent()->getNamedMetadata("nvvm.annotations");
-    if (NMDN) {
-      for (const MDNode *MDN : NMDN->operands()) {
-        assert(MDN->getNumOperands() == 3);
-
-        const Metadata *MD0 = MDN->getOperand(0).get();
-        const auto *MDV0 = cast<ConstantAsMetadata>(MD0)->getValue();
-        const auto *MDFn = cast<Function>(MDV0);
-        if (MDFn != F)
-          continue;
-
-        const Metadata *MD1 = MDN->getOperand(1).get();
-        const MDString *MDStr = cast<MDString>(MD1);
-        if (MDStr->getString() != "kernel")
-          continue;
-
-        const Metadata *MD2 = MDN->getOperand(2).get();
-        const auto *MDV2 = cast<ConstantAsMetadata>(MD2)->getValue();
-        assert(!cast<ConstantInt>(MDV2)->isZero());
-      }
-    }
-#endif
-    return Align(std::max(uint64_t(16), ABITypeAlign));
-  }
-  default:
-    return Align(ABITypeAlign);
-  }
-}
-
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 /// Used to guide target specific optimizations, like loop strength reduction

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 18a697deacb44..13829b924d4b4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -451,16 +451,6 @@ class NVPTXTargetLowering : public TargetLowering {
                           MachineFunction &MF,
                           unsigned Intrinsic) const override;
 
-  /// getFunctionParamOptimizedAlign - since function arguments are passed via
-  /// .param space, we may want to increase their alignment in a way that
-  /// ensures that we can effectively vectorize their loads & stores. We can
-  /// increase alignment only if the function has internal or has private
-  /// linkage as for other linkage types callers may already rely on default
-  /// alignment. To allow using 128-bit vectorized loads/stores, this function
-  /// ensures that alignment is 16 or greater.
-  Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy,
-                                       const DataLayout &DL) const;
-
   /// isLegalAddressingMode - Return true if the addressing mode represented
   /// by AM is legal for this target, for a load/store of the specified type
   /// Used to guide target specific optimizations, like loop strength

diff  --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 19b04f49d76ca..6183019de43df 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -88,17 +88,16 @@
 // cancel the addrspacecast pair this pass emits.
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "NVPTX.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
-#include <queue>
 
 #define DEBUG_TYPE "nvptx-lower-args"
 
@@ -227,90 +226,6 @@ static void convertToParamAS(Value *OldUser, Value *Param) {
                  [](Instruction *I) { I->eraseFromParent(); });
 }
 
-// Adjust alignment of arguments passed byval in .param address space. We can
-// increase alignment of such arguments in a way that ensures that we can
-// effectively vectorize their loads. We should also traverse all loads from
-// byval pointer and adjust their alignment, if those were using known offset.
-// Such alignment changes must be conformed with parameter store and load in
-// NVPTXTargetLowering::LowerCall.
-static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
-                                    const NVPTXTargetLowering *TLI) {
-  Function *Func = Arg->getParent();
-  Type *StructType = Arg->getParamByValType();
-  const DataLayout DL(Func->getParent());
-
-  uint64_t NewArgAlign =
-      TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value();
-  uint64_t CurArgAlign =
-      Arg->getAttribute(Attribute::Alignment).getValueAsInt();
-
-  if (CurArgAlign >= NewArgAlign)
-    return;
-
-  LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign << " instead of "
-                    << CurArgAlign << " for " << *Arg << '\n');
-
-  auto NewAlignAttr =
-      Attribute::get(Func->getContext(), Attribute::Alignment, NewArgAlign);
-  Arg->removeAttr(Attribute::Alignment);
-  Arg->addAttr(NewAlignAttr);
-
-  struct Load {
-    LoadInst *Inst;
-    uint64_t Offset;
-  };
-
-  struct LoadContext {
-    Value *InitialVal;
-    uint64_t Offset;
-  };
-
-  SmallVector<Load> Loads;
-  std::queue<LoadContext> Worklist;
-  Worklist.push({ArgInParamAS, 0});
-
-  while (!Worklist.empty()) {
-    LoadContext Ctx = Worklist.front();
-    Worklist.pop();
-
-    for (User *CurUser : Ctx.InitialVal->users()) {
-      if (auto *I = dyn_cast<LoadInst>(CurUser)) {
-        Loads.push_back({I, Ctx.Offset});
-        continue;
-      }
-
-      if (auto *I = dyn_cast<BitCastInst>(CurUser)) {
-        Worklist.push({I, Ctx.Offset});
-        continue;
-      }
-
-      if (auto *I = dyn_cast<GetElementPtrInst>(CurUser)) {
-        APInt OffsetAccumulated =
-            APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM));
-
-        if (!I->accumulateConstantOffset(DL, OffsetAccumulated))
-          continue;
-
-        uint64_t OffsetLimit = -1;
-        uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit);
-        assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX");
-
-        Worklist.push({I, Ctx.Offset + Offset});
-        continue;
-      }
-
-      llvm_unreachable("All users must be one of: load, "
-                       "bitcast, getelementptr.");
-    }
-  }
-
-  for (Load &CurLoad : Loads) {
-    Align NewLoadAlign(greatestCommonDivisor(NewArgAlign, CurLoad.Offset));
-    Align CurLoadAlign(CurLoad.Inst->getAlign());
-    CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign));
-  }
-}
-
 void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   Function *Func = Arg->getParent();
   Instruction *FirstInst = &(Func->getEntryBlock().front());
@@ -355,16 +270,6 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
       convertToParamAS(V, ArgInParamAS);
     });
     LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n");
-
-    // Further optimizations require target lowering info.
-    if (!TM)
-      return;
-
-    const auto *TLI =
-        cast<NVPTXTargetLowering>(TM->getSubtargetImpl()->getTargetLowering());
-
-    adjustByValArgAlignment(Arg, ArgInParamAS, TLI);
-
     return;
   }
 

diff  --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
deleted file mode 100644
index 681faf0bfa534..0000000000000
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ /dev/null
@@ -1,801 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
-;
-; Check that parameters of a __device__ function with private or internal
-; linkage called from a __global__ (kernel) function get increased alignment,
-; and additional vectorization is performed on loads/stores with that
-; parameters.
-;
-; Test IR is a minimized version of IR generated with the following command
-; from the source code below:
-; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu
-;
-; ----------------------------------------------------------------------------
-; #include <stdint.h>
-;
-; struct St4x1 { uint32_t field[1]; };
-; struct St4x2 { uint32_t field[2]; };
-; struct St4x3 { uint32_t field[3]; };
-; struct St4x4 { uint32_t field[4]; };
-; struct St4x5 { uint32_t field[5]; };
-; struct St4x6 { uint32_t field[6]; };
-; struct St4x7 { uint32_t field[7]; };
-; struct St4x8 { uint32_t field[8]; };
-; struct St8x1 { uint64_t field[1]; };
-; struct St8x2 { uint64_t field[2]; };
-; struct St8x3 { uint64_t field[3]; };
-; struct St8x4 { uint64_t field[4]; };
-;
-; #define DECLARE_CALLEE(StName)                                      \
-; static __device__  __attribute__((noinline))                        \
-; struct StName callee_##StName(struct StName in) {                   \
-;   struct StName ret;                                                \
-;   const unsigned size = sizeof(ret.field) / sizeof(*ret.field);     \
-;   for (unsigned i = 0; i != size; ++i)                              \
-;     ret.field[i] = in.field[i];                                     \
-;   return ret;                                                       \
-; }                                                                   \
-
-; #define DECLARE_CALLER(StName)                                      \
-; __global__                                                          \
-; void caller_##StName(struct StName in, struct StName* ret)          \
-; {                                                                   \
-;   *ret = callee_##StName(in);                                       \
-; }                                                                   \
-;
-; #define DECLARE_CALL(StName)  \
-;     DECLARE_CALLEE(StName)    \
-;     DECLARE_CALLER(StName)    \
-;
-; DECLARE_CALL(St4x1)
-; DECLARE_CALL(St4x2)
-; DECLARE_CALL(St4x3)
-; DECLARE_CALL(St4x4)
-; DECLARE_CALL(St4x5)
-; DECLARE_CALL(St4x6)
-; DECLARE_CALL(St4x7)
-; DECLARE_CALL(St4x8)
-; DECLARE_CALL(St8x1)
-; DECLARE_CALL(St8x2)
-; DECLARE_CALL(St8x3)
-; DECLARE_CALL(St8x4)
-; ----------------------------------------------------------------------------
-
-%struct.St4x1 = type { [1 x i32] }
-%struct.St4x2 = type { [2 x i32] }
-%struct.St4x3 = type { [3 x i32] }
-%struct.St4x4 = type { [4 x i32] }
-%struct.St4x5 = type { [5 x i32] }
-%struct.St4x6 = type { [6 x i32] }
-%struct.St4x7 = type { [7 x i32] }
-%struct.St4x8 = type { [8 x i32] }
-%struct.St8x1 = type { [1 x i64] }
-%struct.St8x2 = type { [2 x i64] }
-%struct.St8x3 = type { [3 x i64] }
-%struct.St8x4 = type { [4 x i64] }
-
-; Section 1 - checking that:
-; - function argument (including retval) vectorization is done with internal linkage;
-; - caller and callee specify correct alignment for callee's params.
-
-define dso_local void @caller_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in, %struct.St4x1* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St4x1(
-  ; CHECK:               .param .align 4 .b8 caller_St4x1_param_0[4],
-  ; CHECK:               .param .b32 caller_St4x1_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .b32 param0;
-  ; CHECK:       st.param.b32 [param0+0], {{%r[0-9]+}};
-  ; CHECK:       .param .align 16 .b8 retval0[4];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St4x1,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.b32 {{%r[0-9]+}}, [retval0+0];
-  %1 = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i64 0, i32 0, i64 0
-  %2 = load i32, i32* %1, align 4
-  %call = tail call fastcc [1 x i32] @callee_St4x1(i32 %2)
-  %.fca.0.extract = extractvalue [1 x i32] %call, 0
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %ret, i64 0, i32 0, i64 0
-  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
-  ret void
-}
-
-define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[4])
-  ; CHECK-LABEL: callee_St4x1(
-  ; CHECK-NEXT:  .param .b32 callee_St4x1_param_0
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0];
-  ; CHECK:       st.param.b32 [func_retval0+0], [[R1]];
-  ; CHECK-NEXT:  ret;
-  %oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0
-  ret [1 x i32] %oldret
-}
-
-define dso_local void @caller_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in, %struct.St4x2* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St4x2(
-  ; CHECK:               .param .align 4 .b8 caller_St4x2_param_0[8],
-  ; CHECK:               .param .b32 caller_St4x2_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[8];
-  ; CHECK:       st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       .param .align 16 .b8 retval0[8];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St4x2,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
-  %agg.tmp = alloca i64, align 8
-  %tmpcast = bitcast i64* %agg.tmp to %struct.St4x2*
-  %1 = bitcast %struct.St4x2* %in to i64*
-  %2 = load i64, i64* %1, align 4
-  store i64 %2, i64* %agg.tmp, align 8
-  %call = tail call fastcc [2 x i32] @callee_St4x2(%struct.St4x2* noundef nonnull byval(%struct.St4x2) align 4 %tmpcast)
-  %.fca.0.extract = extractvalue [2 x i32] %call, 0
-  %.fca.1.extract = extractvalue [2 x i32] %call, 1
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 0
-  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
-  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 1
-  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4
-  ret void
-}
-
-define internal fastcc [2 x i32] @callee_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[8])
-  ; CHECK-LABEL: callee_St4x2(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x2_param_0[8]
-  ; CHECK:       ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0];
-  ; CHECK:       st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]};
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %3 = insertvalue [2 x i32] poison, i32 %1, 0
-  %oldret = insertvalue [2 x i32] %3, i32 %2, 1
-  ret [2 x i32] %oldret
-}
-
-define dso_local void @caller_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in, %struct.St4x3* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St4x3(
-  ; CHECK:               .param .align 4 .b8 caller_St4x3_param_0[12],
-  ; CHECK:               .param .b32 caller_St4x3_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[12];
-  ; CHECK:       st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       st.param.b32    [param0+8], {{%r[0-9]+}};
-  ; CHECK:       .param .align 16 .b8 retval0[12];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St4x3,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
-  ; CHECK:       ld.param.b32    {{%r[0-9]+}},  [retval0+8];
-  %call = tail call fastcc [3 x i32] @callee_St4x3(%struct.St4x3* noundef nonnull byval(%struct.St4x3) align 4 %in)
-  %.fca.0.extract = extractvalue [3 x i32] %call, 0
-  %.fca.1.extract = extractvalue [3 x i32] %call, 1
-  %.fca.2.extract = extractvalue [3 x i32] %call, 2
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 0
-  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
-  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 1
-  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4
-  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 2
-  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4
-  ret void
-}
-
-
-define internal fastcc [3 x i32] @callee_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[12])
-  ; CHECK-LABEL: callee_St4x3(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x3_param_0[12]
-  ; CHECK:       ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
-  ; CHECK:       ld.param.u32    [[R3:%r[0-9]+]],  [callee_St4x3_param_0+8];
-  ; CHECK:       st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]};
-  ; CHECK:       st.param.b32    [func_retval0+8], [[R3]];
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %4 = insertvalue [3 x i32] poison, i32 %1, 0
-  %5 = insertvalue [3 x i32] %4, i32 %2, 1
-  %oldret = insertvalue [3 x i32] %5, i32 %3, 2
-  ret [3 x i32] %oldret
-}
-
-
-define dso_local void @caller_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in, %struct.St4x4* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St4x4(
-  ; CHECK:               .param .align 4 .b8 caller_St4x4_param_0[16],
-  ; CHECK:               .param .b32 caller_St4x4_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[16];
-  ; CHECK:       st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       .param .align 16 .b8 retval0[16];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St4x4,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
-  %call = tail call fastcc [4 x i32] @callee_St4x4(%struct.St4x4* noundef nonnull byval(%struct.St4x4) align 4 %in)
-  %.fca.0.extract = extractvalue [4 x i32] %call, 0
-  %.fca.1.extract = extractvalue [4 x i32] %call, 1
-  %.fca.2.extract = extractvalue [4 x i32] %call, 2
-  %.fca.3.extract = extractvalue [4 x i32] %call, 3
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 0
-  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
-  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 1
-  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4
-  %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 2
-  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx5, align 4
-  %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 3
-  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx7, align 4
-  ret void
-}
-
-
-define internal fastcc [4 x i32] @callee_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
-  ; CHECK-LABEL: callee_St4x4(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x4_param_0[16]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0];
-  ; CHECK:       st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]};
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %5 = insertvalue [4 x i32] poison, i32 %1, 0
-  %6 = insertvalue [4 x i32] %5, i32 %2, 1
-  %7 = insertvalue [4 x i32] %6, i32 %3, 2
-  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
-  ret [4 x i32] %oldret
-}
-
-
-define dso_local void @caller_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in, %struct.St4x5* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St4x5(
-  ; CHECK:               .param .align 4 .b8 caller_St4x5_param_0[20],
-  ; CHECK:               .param .b32 caller_St4x5_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[20];
-  ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       st.param.b32    [param0+16], {{%r[0-9]+}};
-  ; CHECK:       .param .align 16 .b8 retval0[20];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St4x5,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
-  ; CHECK:       ld.param.b32    {{%r[0-9]+}},  [retval0+16];
-  %call = tail call fastcc [5 x i32] @callee_St4x5(%struct.St4x5* noundef nonnull byval(%struct.St4x5) align 4 %in)
-  %.fca.0.extract = extractvalue [5 x i32] %call, 0
-  %.fca.1.extract = extractvalue [5 x i32] %call, 1
-  %.fca.2.extract = extractvalue [5 x i32] %call, 2
-  %.fca.3.extract = extractvalue [5 x i32] %call, 3
-  %.fca.4.extract = extractvalue [5 x i32] %call, 4
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 0
-  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
-  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 1
-  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4
-  %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 2
-  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx5, align 4
-  %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 3
-  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx7, align 4
-  %ref.tmp.sroa.7.0..sroa_idx9 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 4
-  store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx9, align 4
-  ret void
-}
-
-
-define internal fastcc [5 x i32] @callee_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[20])
-  ; CHECK-LABEL: callee_St4x5(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x5_param_0[20]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0];
-  ; CHECK:       ld.param.u32    [[R5:%r[0-9]+]],   [callee_St4x5_param_0+16];
-  ; CHECK:       st.param.v4.b32 [func_retval0+0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
-  ; CHECK:       st.param.b32    [func_retval0+16], [[R5]];
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 4
-  %5 = load i32, i32* %arrayidx.4, align 4
-  %6 = insertvalue [5 x i32] poison, i32 %1, 0
-  %7 = insertvalue [5 x i32] %6, i32 %2, 1
-  %8 = insertvalue [5 x i32] %7, i32 %3, 2
-  %9 = insertvalue [5 x i32] %8, i32 %4, 3
-  %oldret = insertvalue [5 x i32] %9, i32 %5, 4
-  ret [5 x i32] %oldret
-}
-
-
-define dso_local void @caller_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in, %struct.St4x6* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St4x6(
-  ; CHECK:               .param .align 4 .b8 caller_St4x6_param_0[24],
-  ; CHECK:               .param .b32 caller_St4x6_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[24];
-  ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       .param .align 16 .b8 retval0[24];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St4x6,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
-  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
-  %call = tail call fastcc [6 x i32] @callee_St4x6(%struct.St4x6* noundef nonnull byval(%struct.St4x6) align 4 %in)
-  %.fca.0.extract = extractvalue [6 x i32] %call, 0
-  %.fca.1.extract = extractvalue [6 x i32] %call, 1
-  %.fca.2.extract = extractvalue [6 x i32] %call, 2
-  %.fca.3.extract = extractvalue [6 x i32] %call, 3
-  %.fca.4.extract = extractvalue [6 x i32] %call, 4
-  %.fca.5.extract = extractvalue [6 x i32] %call, 5
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 0
-  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
-  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 1
-  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4
-  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 2
-  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4
-  %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 3
-  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4
-  %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 4
-  store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4
-  %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 5
-  store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4
-  ret void
-}
-
-
-define internal fastcc [6 x i32] @callee_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[24])
-  ; CHECK-LABEL: callee_St4x6(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x6_param_0[24]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0];
-  ; CHECK:       ld.param.v2.u32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16];
-  ; CHECK:       st.param.v4.b32 [func_retval0+0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
-  ; CHECK:       st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 4
-  %5 = load i32, i32* %arrayidx.4, align 4
-  %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 5
-  %6 = load i32, i32* %arrayidx.5, align 4
-  %7 = insertvalue [6 x i32] poison, i32 %1, 0
-  %8 = insertvalue [6 x i32] %7, i32 %2, 1
-  %9 = insertvalue [6 x i32] %8, i32 %3, 2
-  %10 = insertvalue [6 x i32] %9, i32 %4, 3
-  %11 = insertvalue [6 x i32] %10, i32 %5, 4
-  %oldret = insertvalue [6 x i32] %11, i32 %6, 5
-  ret [6 x i32] %oldret
-}
-
-
-define dso_local void @caller_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in, %struct.St4x7* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St4x7(
-  ; CHECK:               .param .align 4 .b8 caller_St4x7_param_0[28],
-  ; CHECK:               .param .b32 caller_St4x7_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[28];
-  ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       st.param.b32    [param0+24], {{%r[0-9]+}};
-  ; CHECK:       .param .align 16 .b8 retval0[28];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St4x7,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
-  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
-  ; CHECK:       ld.param.b32    {{%r[0-9]+}}, [retval0+24];
-  %call = tail call fastcc [7 x i32] @callee_St4x7(%struct.St4x7* noundef nonnull byval(%struct.St4x7) align 4 %in)
-  %.fca.0.extract = extractvalue [7 x i32] %call, 0
-  %.fca.1.extract = extractvalue [7 x i32] %call, 1
-  %.fca.2.extract = extractvalue [7 x i32] %call, 2
-  %.fca.3.extract = extractvalue [7 x i32] %call, 3
-  %.fca.4.extract = extractvalue [7 x i32] %call, 4
-  %.fca.5.extract = extractvalue [7 x i32] %call, 5
-  %.fca.6.extract = extractvalue [7 x i32] %call, 6
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 0
-  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
-  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 1
-  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4
-  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 2
-  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4
-  %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 3
-  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4
-  %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 4
-  store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4
-  %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 5
-  store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4
-  %ref.tmp.sroa.9.0..sroa_idx12 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 6
-  store i32 %.fca.6.extract, i32* %ref.tmp.sroa.9.0..sroa_idx12, align 4
-  ret void
-}
-
-
-define internal fastcc [7 x i32] @callee_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[28])
-  ; CHECK-LABEL: callee_St4x7(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x7_param_0[28]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
-  ; CHECK:       ld.param.v2.u32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
-  ; CHECK:       ld.param.u32    [[R7:%r[0-9]+]],   [callee_St4x7_param_0+24];
-  ; CHECK:       st.param.v4.b32 [func_retval0+0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
-  ; CHECK:       st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
-  ; CHECK:       st.param.b32    [func_retval0+24], [[R7]];
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 4
-  %5 = load i32, i32* %arrayidx.4, align 4
-  %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 5
-  %6 = load i32, i32* %arrayidx.5, align 4
-  %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 6
-  %7 = load i32, i32* %arrayidx.6, align 4
-  %8 = insertvalue [7 x i32] poison, i32 %1, 0
-  %9 = insertvalue [7 x i32] %8, i32 %2, 1
-  %10 = insertvalue [7 x i32] %9, i32 %3, 2
-  %11 = insertvalue [7 x i32] %10, i32 %4, 3
-  %12 = insertvalue [7 x i32] %11, i32 %5, 4
-  %13 = insertvalue [7 x i32] %12, i32 %6, 5
-  %oldret = insertvalue [7 x i32] %13, i32 %7, 6
-  ret [7 x i32] %oldret
-}
-
-
-define dso_local void @caller_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in, %struct.St4x8* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St4x8(
-  ; CHECK:               .param .align 4 .b8 caller_St4x8_param_0[32],
-  ; CHECK:               .param .b32 caller_St4x8_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[32];
-  ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
-  ; CHECK:       .param .align 16 .b8 retval0[32];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St4x8,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0];
-  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
-  %call = tail call fastcc [8 x i32] @callee_St4x8(%struct.St4x8* noundef nonnull byval(%struct.St4x8) align 4 %in)
-  %.fca.0.extract = extractvalue [8 x i32] %call, 0
-  %.fca.1.extract = extractvalue [8 x i32] %call, 1
-  %.fca.2.extract = extractvalue [8 x i32] %call, 2
-  %.fca.3.extract = extractvalue [8 x i32] %call, 3
-  %.fca.4.extract = extractvalue [8 x i32] %call, 4
-  %.fca.5.extract = extractvalue [8 x i32] %call, 5
-  %.fca.6.extract = extractvalue [8 x i32] %call, 6
-  %.fca.7.extract = extractvalue [8 x i32] %call, 7
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 0
-  store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4
-  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 1
-  store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4
-  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 2
-  store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4
-  %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 3
-  store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4
-  %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 4
-  store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4
-  %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 5
-  store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4
-  %ref.tmp.sroa.9.0..sroa_idx12 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 6
-  store i32 %.fca.6.extract, i32* %ref.tmp.sroa.9.0..sroa_idx12, align 4
-  %ref.tmp.sroa.10.0..sroa_idx14 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 7
-  store i32 %.fca.7.extract, i32* %ref.tmp.sroa.10.0..sroa_idx14, align 4
-  ret void
-}
-
-
-define internal fastcc [8 x i32] @callee_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[32])
-  ; CHECK-LABEL: callee_St4x8(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x8_param_0[32]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0];
-  ; CHECK:       ld.param.v4.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16];
-  ; CHECK:       st.param.v4.b32 [func_retval0+0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
-  ; CHECK:       st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]};
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 4
-  %5 = load i32, i32* %arrayidx.4, align 4
-  %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 5
-  %6 = load i32, i32* %arrayidx.5, align 4
-  %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 6
-  %7 = load i32, i32* %arrayidx.6, align 4
-  %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 7
-  %8 = load i32, i32* %arrayidx.7, align 4
-  %9 = insertvalue [8 x i32] poison, i32 %1, 0
-  %10 = insertvalue [8 x i32] %9, i32 %2, 1
-  %11 = insertvalue [8 x i32] %10, i32 %3, 2
-  %12 = insertvalue [8 x i32] %11, i32 %4, 3
-  %13 = insertvalue [8 x i32] %12, i32 %5, 4
-  %14 = insertvalue [8 x i32] %13, i32 %6, 5
-  %15 = insertvalue [8 x i32] %14, i32 %7, 6
-  %oldret = insertvalue [8 x i32] %15, i32 %8, 7
-  ret [8 x i32] %oldret
-}
-
-
-define dso_local void @caller_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in, %struct.St8x1* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St8x1(
-  ; CHECK:               .param .align 8 .b8 caller_St8x1_param_0[8],
-  ; CHECK:               .param .b32 caller_St8x1_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .b64 param0;
-  ; CHECK:       st.param.b64 [param0+0], {{%rd[0-9]+}};
-  ; CHECK:       .param .align 16 .b8 retval0[8];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St8x1,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.b64 {{%rd[0-9]+}}, [retval0+0];
-  %1 = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i64 0, i32 0, i64 0
-  %2 = load i64, i64* %1, align 8
-  %call = tail call fastcc [1 x i64] @callee_St8x1(i64 %2)
-  %.fca.0.extract = extractvalue [1 x i64] %call, 0
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %ret, i64 0, i32 0, i64 0
-  store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8
-  ret void
-}
-
-
-define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[8])
-  ; CHECK-LABEL: callee_St8x1(
-  ; CHECK-NEXT:  .param .b64 callee_St8x1_param_0
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0];
-  ; CHECK:       st.param.b64 [func_retval0+0],  [[RD1]];
-  ; CHECK-NEXT:  ret;
-  %oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0
-  ret [1 x i64] %oldret
-}
-
-
-define dso_local void @caller_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in, %struct.St8x2* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St8x2(
-  ; CHECK:               .param .align 8 .b8 caller_St8x2_param_0[16],
-  ; CHECK:               .param .b32 caller_St8x2_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[16];
-  ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
-  ; CHECK:       .param .align 16 .b8 retval0[16];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St8x2,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0];
-  %call = tail call fastcc [2 x i64] @callee_St8x2(%struct.St8x2* noundef nonnull byval(%struct.St8x2) align 8 %in)
-  %.fca.0.extract = extractvalue [2 x i64] %call, 0
-  %.fca.1.extract = extractvalue [2 x i64] %call, 1
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 0
-  store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8
-  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 1
-  store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx3, align 8
-  ret void
-}
-
-
-define internal fastcc [2 x i64] @callee_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
-  ; CHECK-LABEL: callee_St8x2(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x2_param_0[16]
-  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0];
-  ; CHECK:       st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]};
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 0
-  %1 = load i64, i64* %arrayidx, align 8
-  %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 1
-  %2 = load i64, i64* %arrayidx.1, align 8
-  %3 = insertvalue [2 x i64] poison, i64 %1, 0
-  %oldret = insertvalue [2 x i64] %3, i64 %2, 1
-  ret [2 x i64] %oldret
-}
-
-
-define dso_local void @caller_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in, %struct.St8x3* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St8x3(
-  ; CHECK:               .param .align 8 .b8 caller_St8x3_param_0[24],
-  ; CHECK:               .param .b32 caller_St8x3_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[24];
-  ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
-  ; CHECK:       st.param.b64    [param0+16], {{%rd[0-9]+}};
-  ; CHECK:       .param .align 16 .b8 retval0[24];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St8x3,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0];
-  ; CHECK:       ld.param.b64    {{%rd[0-9]+}}, [retval0+16];
-  %call = tail call fastcc [3 x i64] @callee_St8x3(%struct.St8x3* noundef nonnull byval(%struct.St8x3) align 8 %in)
-  %.fca.0.extract = extractvalue [3 x i64] %call, 0
-  %.fca.1.extract = extractvalue [3 x i64] %call, 1
-  %.fca.2.extract = extractvalue [3 x i64] %call, 2
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 0
-  store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8
-  %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 1
-  store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx2, align 8
-  %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 2
-  store i64 %.fca.2.extract, i64* %ref.tmp.sroa.5.0..sroa_idx4, align 8
-  ret void
-}
-
-
-define internal fastcc [3 x i64] @callee_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[24])
-  ; CHECK-LABEL: callee_St8x3(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x3_param_0[24]
-  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0];
-  ; CHECK:       ld.param.u64    [[RD3:%rd[0-9]+]],  [callee_St8x3_param_0+16];
-  ; CHECK:       st.param.v2.b64 [func_retval0+0],   {[[RD1]], [[RD2]]};
-  ; CHECK:       st.param.b64    [func_retval0+16],  [[RD3]];
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 0
-  %1 = load i64, i64* %arrayidx, align 8
-  %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 1
-  %2 = load i64, i64* %arrayidx.1, align 8
-  %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 2
-  %3 = load i64, i64* %arrayidx.2, align 8
-  %4 = insertvalue [3 x i64] poison, i64 %1, 0
-  %5 = insertvalue [3 x i64] %4, i64 %2, 1
-  %oldret = insertvalue [3 x i64] %5, i64 %3, 2
-  ret [3 x i64] %oldret
-}
-
-
-define dso_local void @caller_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in, %struct.St8x4* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func caller_St8x4(
-  ; CHECK:               .param .align 8 .b8 caller_St8x4_param_0[32],
-  ; CHECK:               .param .b32 caller_St8x4_param_1
-  ; CHECK:       )
-  ; CHECK:       .param .align 16 .b8 param0[32];
-  ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
-  ; CHECK:       st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
-  ; CHECK:       .param .align 16 .b8 retval0[32];
-  ; CHECK:       call.uni (retval0),
-  ; CHECK-NEXT:  callee_St8x4,
-  ; CHECK-NEXT:  (
-  ; CHECK-NEXT:  param0
-  ; CHECK-NEXT:  );
-  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0];
-  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16];
-  %call = tail call fastcc [4 x i64] @callee_St8x4(%struct.St8x4* noundef nonnull byval(%struct.St8x4) align 8 %in)
-  %.fca.0.extract = extractvalue [4 x i64] %call, 0
-  %.fca.1.extract = extractvalue [4 x i64] %call, 1
-  %.fca.2.extract = extractvalue [4 x i64] %call, 2
-  %.fca.3.extract = extractvalue [4 x i64] %call, 3
-  %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 0
-  store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8
-  %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 1
-  store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx3, align 8
-  %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 2
-  store i64 %.fca.2.extract, i64* %ref.tmp.sroa.5.0..sroa_idx5, align 8
-  %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 3
-  store i64 %.fca.3.extract, i64* %ref.tmp.sroa.6.0..sroa_idx7, align 8
-  ret void
-}
-
-
-define internal fastcc [4 x i64] @callee_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[32])
-  ; CHECK-LABEL: callee_St8x4(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x4_param_0[32]
-  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0];
-  ; CHECK:       ld.param.v2.u64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16];
-  ; CHECK:       st.param.v2.b64 [func_retval0+0],  {[[RD1]], [[RD2]]};
-  ; CHECK:       st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]};
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 0
-  %1 = load i64, i64* %arrayidx, align 8
-  %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 1
-  %2 = load i64, i64* %arrayidx.1, align 8
-  %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 2
-  %3 = load i64, i64* %arrayidx.2, align 8
-  %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 3
-  %4 = load i64, i64* %arrayidx.3, align 8
-  %5 = insertvalue [4 x i64] poison, i64 %1, 0
-  %6 = insertvalue [4 x i64] %5, i64 %2, 1
-  %7 = insertvalue [4 x i64] %6, i64 %3, 2
-  %oldret = insertvalue [4 x i64] %7, i64 %4, 3
-  ret [4 x i64] %oldret
-}
-
-; Section 2 - checking that function argument (including retval) vectorization is done with private linkage.
-
-define private fastcc [4 x i32] @callee_St4x4_private(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
-  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
-  ; CHECK-LABEL: callee_St4x4_private(
-  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x4_private_param_0[16]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0];
-  ; CHECK:       st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]};
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %5 = insertvalue [4 x i32] poison, i32 %1, 0
-  %6 = insertvalue [4 x i32] %5, i32 %2, 1
-  %7 = insertvalue [4 x i32] %6, i32 %3, 2
-  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
-  ret [4 x i32] %oldret
-}
-
-; Section 3 - checking that function argument (including retval) vectorization
-; is NOT done with linkage types other than internal and private.
-
-define external fastcc [4 x i32] @callee_St4x4_external(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
-  ; CHECK:       .func  (.param .align 4 .b8 func_retval0[16])
-  ; CHECK-LABEL: callee_St4x4_external(
-  ; CHECK-NEXT:  .param .align 4 .b8 callee_St4x4_external_param_0[16]
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]],   [callee_St4x4_external_param_0];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]],   [callee_St4x4_external_param_0+4];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]],   [callee_St4x4_external_param_0+8];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]],   [callee_St4x4_external_param_0+12];
-  ; CHECK:       st.param.b32 [func_retval0+0],  [[R1]];
-  ; CHECK:       st.param.b32 [func_retval0+4],  [[R2]];
-  ; CHECK:       st.param.b32 [func_retval0+8],  [[R3]];
-  ; CHECK:       st.param.b32 [func_retval0+12], [[R4]];
-  ; CHECK-NEXT:  ret;
-  %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %5 = insertvalue [4 x i32] poison, i32 %1, 0
-  %6 = insertvalue [4 x i32] %5, i32 %2, 1
-  %7 = insertvalue [4 x i32] %6, i32 %3, 2
-  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
-  ret [4 x i32] %oldret
-}

diff  --git a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
deleted file mode 100644
index 4a1ed8f4dcda7..0000000000000
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
+++ /dev/null
@@ -1,456 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
-;
-; Check that parameters of a __global__ (kernel) function do not get increased
-; alignment, and no additional vectorization is performed on loads/stores with
-; that parameters.
-;
-; Test IR is a minimized version of IR generated with the following command
-; from the source code below:
-; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu
-;
-; ----------------------------------------------------------------------------
-; #include <stdint.h>
-;
-; struct St4x1 { uint32_t field[1]; };
-; struct St4x2 { uint32_t field[2]; };
-; struct St4x3 { uint32_t field[3]; };
-; struct St4x4 { uint32_t field[4]; };
-; struct St4x5 { uint32_t field[5]; };
-; struct St4x6 { uint32_t field[6]; };
-; struct St4x7 { uint32_t field[7]; };
-; struct St4x8 { uint32_t field[8]; };
-; struct St8x1 { uint64_t field[1]; };
-; struct St8x2 { uint64_t field[2]; };
-; struct St8x3 { uint64_t field[3]; };
-; struct St8x4 { uint64_t field[4]; };
-;
-; #define DECLARE_FUNCTION(StName)                                    \
-; static __global__  __attribute__((noinline))                        \
-; void foo_##StName(struct StName in, struct StName* ret) {           \
-;   const unsigned size = sizeof(ret->field) / sizeof(*ret->field);   \
-;   for (unsigned i = 0; i != size; ++i)                              \
-;     ret->field[i] = in.field[i];                                    \
-; }                                                                   \
-;
-; DECLARE_FUNCTION(St4x1)
-; DECLARE_FUNCTION(St4x2)
-; DECLARE_FUNCTION(St4x3)
-; DECLARE_FUNCTION(St4x4)
-; DECLARE_FUNCTION(St4x5)
-; DECLARE_FUNCTION(St4x6)
-; DECLARE_FUNCTION(St4x7)
-; DECLARE_FUNCTION(St4x8)
-; DECLARE_FUNCTION(St8x1)
-; DECLARE_FUNCTION(St8x2)
-; DECLARE_FUNCTION(St8x3)
-; DECLARE_FUNCTION(St8x4)
-; ----------------------------------------------------------------------------
-
-%struct.St4x1 = type { [1 x i32] }
-%struct.St4x2 = type { [2 x i32] }
-%struct.St4x3 = type { [3 x i32] }
-%struct.St4x4 = type { [4 x i32] }
-%struct.St4x5 = type { [5 x i32] }
-%struct.St4x6 = type { [6 x i32] }
-%struct.St4x7 = type { [7 x i32] }
-%struct.St4x8 = type { [8 x i32] }
-%struct.St8x1 = type { [1 x i64] }
-%struct.St8x2 = type { [2 x i64] }
-%struct.St8x3 = type { [3 x i64] }
-%struct.St8x4 = type { [4 x i64] }
-
-define dso_local void @foo_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in, %struct.St4x1* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St4x1(
-  ; CHECK:               .param .align 4 .b8 foo_St4x1_param_0[4],
-  ; CHECK:               .param .b32 foo_St4x1_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x1_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x1_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds %struct.St4x1, %struct.St4x1* %ret, i64 0, i32 0, i64 0
-  store i32 %1, i32* %arrayidx3, align 4
-  ret void
-}
-
-define dso_local void @foo_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in, %struct.St4x2* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St4x2(
-  ; CHECK:               .param .align 4 .b8 foo_St4x2_param_0[8],
-  ; CHECK:               .param .b32 foo_St4x2_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x2_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x2_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 0
-  store i32 %1, i32* %arrayidx3, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx3.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 1
-  store i32 %2, i32* %arrayidx3.1, align 4
-  ret void
-}
-
-define dso_local void @foo_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in, %struct.St4x3* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St4x3(
-  ; CHECK:               .param .align 4 .b8 foo_St4x3_param_0[12],
-  ; CHECK:               .param .b32 foo_St4x3_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x3_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x3_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 0
-  store i32 %1, i32* %arrayidx3, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx3.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 1
-  store i32 %2, i32* %arrayidx3.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx3.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 2
-  store i32 %3, i32* %arrayidx3.2, align 4
-  ret void
-}
-
-define dso_local void @foo_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in, %struct.St4x4* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St4x4(
-  ; CHECK:               .param .align 4 .b8 foo_St4x4_param_0[16],
-  ; CHECK:               .param .b32 foo_St4x4_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x4_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x4_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 0
-  store i32 %1, i32* %arrayidx3, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx3.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 1
-  store i32 %2, i32* %arrayidx3.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx3.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 2
-  store i32 %3, i32* %arrayidx3.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx3.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 3
-  store i32 %4, i32* %arrayidx3.3, align 4
-  ret void
-}
-
-define dso_local void @foo_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in, %struct.St4x5* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St4x5(
-  ; CHECK:               .param .align 4 .b8 foo_St4x5_param_0[20],
-  ; CHECK:               .param .b32 foo_St4x5_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x5_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x5_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16];
-  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 0
-  store i32 %1, i32* %arrayidx3, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx3.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 1
-  store i32 %2, i32* %arrayidx3.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx3.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 2
-  store i32 %3, i32* %arrayidx3.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx3.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 3
-  store i32 %4, i32* %arrayidx3.3, align 4
-  %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 4
-  %5 = load i32, i32* %arrayidx.4, align 4
-  %arrayidx3.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 4
-  store i32 %5, i32* %arrayidx3.4, align 4
-  ret void
-}
-
-define dso_local void @foo_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in, %struct.St4x6* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St4x6(
-  ; CHECK:               .param .align 4 .b8 foo_St4x6_param_0[24],
-  ; CHECK:               .param .b32 foo_St4x6_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x6_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x6_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16];
-  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
-  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20];
-  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 0
-  store i32 %1, i32* %arrayidx3, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx3.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 1
-  store i32 %2, i32* %arrayidx3.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx3.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 2
-  store i32 %3, i32* %arrayidx3.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx3.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 3
-  store i32 %4, i32* %arrayidx3.3, align 4
-  %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 4
-  %5 = load i32, i32* %arrayidx.4, align 4
-  %arrayidx3.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 4
-  store i32 %5, i32* %arrayidx3.4, align 4
-  %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 5
-  %6 = load i32, i32* %arrayidx.5, align 4
-  %arrayidx3.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 5
-  store i32 %6, i32* %arrayidx3.5, align 4
-  ret void
-}
-
-define dso_local void @foo_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in, %struct.St4x7* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St4x7(
-  ; CHECK:               .param .align 4 .b8 foo_St4x7_param_0[28],
-  ; CHECK:               .param .b32 foo_St4x7_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x7_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x7_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16];
-  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
-  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20];
-  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
-  ; CHECK:       ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24];
-  ; CHECK:       st.u32  [[[R1]]+24], [[R8]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 0
-  store i32 %1, i32* %arrayidx3, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx3.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 1
-  store i32 %2, i32* %arrayidx3.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx3.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 2
-  store i32 %3, i32* %arrayidx3.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx3.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 3
-  store i32 %4, i32* %arrayidx3.3, align 4
-  %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 4
-  %5 = load i32, i32* %arrayidx.4, align 4
-  %arrayidx3.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 4
-  store i32 %5, i32* %arrayidx3.4, align 4
-  %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 5
-  %6 = load i32, i32* %arrayidx.5, align 4
-  %arrayidx3.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 5
-  store i32 %6, i32* %arrayidx3.5, align 4
-  %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 6
-  %7 = load i32, i32* %arrayidx.6, align 4
-  %arrayidx3.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 6
-  store i32 %7, i32* %arrayidx3.6, align 4
-  ret void
-}
-
-define dso_local void @foo_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in, %struct.St4x8* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St4x8(
-  ; CHECK:               .param .align 4 .b8 foo_St4x8_param_0[32],
-  ; CHECK:               .param .b32 foo_St4x8_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x8_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x8_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16];
-  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
-  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x8_param_0+20];
-  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
-  ; CHECK:       ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24];
-  ; CHECK:       st.u32  [[[R1]]+24], [[R8]];
-  ; CHECK:       ld.param.u32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28];
-  ; CHECK:       st.u32  [[[R1]]+28], [[R9]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 0
-  %1 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 0
-  store i32 %1, i32* %arrayidx3, align 4
-  %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 1
-  %2 = load i32, i32* %arrayidx.1, align 4
-  %arrayidx3.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 1
-  store i32 %2, i32* %arrayidx3.1, align 4
-  %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 2
-  %3 = load i32, i32* %arrayidx.2, align 4
-  %arrayidx3.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 2
-  store i32 %3, i32* %arrayidx3.2, align 4
-  %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 3
-  %4 = load i32, i32* %arrayidx.3, align 4
-  %arrayidx3.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 3
-  store i32 %4, i32* %arrayidx3.3, align 4
-  %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 4
-  %5 = load i32, i32* %arrayidx.4, align 4
-  %arrayidx3.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 4
-  store i32 %5, i32* %arrayidx3.4, align 4
-  %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 5
-  %6 = load i32, i32* %arrayidx.5, align 4
-  %arrayidx3.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 5
-  store i32 %6, i32* %arrayidx3.5, align 4
-  %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 6
-  %7 = load i32, i32* %arrayidx.6, align 4
-  %arrayidx3.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 6
-  store i32 %7, i32* %arrayidx3.6, align 4
-  %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 7
-  %8 = load i32, i32* %arrayidx.7, align 4
-  %arrayidx3.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 7
-  store i32 %8, i32* %arrayidx3.7, align 4
-  ret void
-}
-
-define dso_local void @foo_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in, %struct.St8x1* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St8x1(
-  ; CHECK:               .param .align 8 .b8 foo_St8x1_param_0[8],
-  ; CHECK:               .param .b32 foo_St8x1_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x1_param_1];
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0];
-  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i64 0, i32 0, i64 0
-  %1 = load i64, i64* %arrayidx, align 8
-  %arrayidx3 = getelementptr inbounds %struct.St8x1, %struct.St8x1* %ret, i64 0, i32 0, i64 0
-  store i64 %1, i64* %arrayidx3, align 8
-  ret void
-}
-
-define dso_local void @foo_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in, %struct.St8x2* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St8x2(
-  ; CHECK:               .param .align 8 .b8 foo_St8x2_param_0[16],
-  ; CHECK:               .param .b32 foo_St8x2_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x2_param_1];
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0];
-  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
-  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8];
-  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 0
-  %1 = load i64, i64* %arrayidx, align 8
-  %arrayidx3 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 0
-  store i64 %1, i64* %arrayidx3, align 8
-  %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 1
-  %2 = load i64, i64* %arrayidx.1, align 8
-  %arrayidx3.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 1
-  store i64 %2, i64* %arrayidx3.1, align 8
-  ret void
-}
-
-define dso_local void @foo_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in, %struct.St8x3* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St8x3(
-  ; CHECK:               .param .align 8 .b8 foo_St8x3_param_0[24],
-  ; CHECK:               .param .b32 foo_St8x3_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x3_param_1];
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0];
-  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
-  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8];
-  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
-  ; CHECK:       ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16];
-  ; CHECK:       st.u64 [[[R1]]+16], [[RD3]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 0
-  %1 = load i64, i64* %arrayidx, align 8
-  %arrayidx3 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 0
-  store i64 %1, i64* %arrayidx3, align 8
-  %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 1
-  %2 = load i64, i64* %arrayidx.1, align 8
-  %arrayidx3.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 1
-  store i64 %2, i64* %arrayidx3.1, align 8
-  %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 2
-  %3 = load i64, i64* %arrayidx.2, align 8
-  %arrayidx3.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 2
-  store i64 %3, i64* %arrayidx3.2, align 8
-  ret void
-}
-
-define dso_local void @foo_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in, %struct.St8x4* nocapture noundef writeonly %ret) {
-  ; CHECK-LABEL: .visible .func foo_St8x4(
-  ; CHECK:               .param .align 8 .b8 foo_St8x4_param_0[32],
-  ; CHECK:               .param .b32 foo_St8x4_param_1
-  ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x4_param_1];
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0];
-  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
-  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8];
-  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
-  ; CHECK:       ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16];
-  ; CHECK:       st.u64 [[[R1]]+16], [[RD3]];
-  ; CHECK:       ld.param.u64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24];
-  ; CHECK:       st.u64 [[[R1]]+24], [[RD4]];
-  ; CHECK:       ret;
-  %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 0
-  %1 = load i64, i64* %arrayidx, align 8
-  %arrayidx3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 0
-  store i64 %1, i64* %arrayidx3, align 8
-  %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 1
-  %2 = load i64, i64* %arrayidx.1, align 8
-  %arrayidx3.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 1
-  store i64 %2, i64* %arrayidx3.1, align 8
-  %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 2
-  %3 = load i64, i64* %arrayidx.2, align 8
-  %arrayidx3.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 2
-  store i64 %3, i64* %arrayidx3.2, align 8
-  %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 3
-  %4 = load i64, i64* %arrayidx.3, align 8
-  %arrayidx3.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 3
-  store i64 %4, i64* %arrayidx3.3, align 8
-  ret void
-}


        


More information about the cfe-commits mailing list